From efa313addcd2864e9c045262f2e2f080497e3b62 Mon Sep 17 00:00:00 2001
From: "Md. Kishor Morol" <kishoremorol@gmail.com>
Date: Sat, 6 Jun 2026 19:04:59 -0400
Subject: [PATCH 1/2] feat: cap site to 1000/section + split HF dataset by
 source

Website:
- Cap each browse section (arXiv / conference / journal) to 1,000 papers
  (3,000 total). _queryPapers now clamps the reported count to SECTION_CAP
  and trims rows past the cap, so pagination stops at 1,000 per section.
  The full corpus stays available via the API and the HF dataset; the
  static JSON was already capped at 1,000.

HF dataset:
- Split the `papers` config by source into `arxiv`, `conference`, and
  `journal` splits (uploaded as data/papers_<source>.jsonl), alongside the
  existing combined `train` split (kept for backward compatibility).
  Workshop papers go to `conference`; preprints and rows with an unknown
  source_type go to `arxiv`.
  Users can now `load_dataset(repo, "papers", split="journal")`.
- Card updated: per-source counts in stats, file table, and usage examples.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 site/assets/js/railway-api.js | 25 +++++++++++----
 src/storage/hf_dataset.py     | 58 +++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/site/assets/js/railway-api.js b/site/assets/js/railway-api.js
index 9a70a68..7bb7a6e 100644
--- a/site/assets/js/railway-api.js
+++ b/site/assets/js/railway-api.js
@@ -11,6 +11,11 @@
 
 const RS_API = 'https://researchscope-production.up.railway.app';
 
+// Public site shows at most this many papers per section (arXiv / conference /
+// journal) — 3 000 total. The full corpus stays available via the API and the
+// Hugging Face dataset; this just bounds what the browse pages paginate through.
+const SECTION_CAP = 1000;
+
 // ── Auth state ────────────────────────────────────────────────────────────────
 
 const _auth = {
@@ -67,25 +72,33 @@ async function _queryPapers({
   else if (source === 'conference') params.set('source_type', 'conference');
   else if (source === 'journal')    params.set('source_type', 'journal');
 
-  // 1. Try Railway
+  const start = (page - 1) * pageSize;
+
+  // 1. Try Railway — clamp the reported count and trim rows past the cap so the
+  // browse page paginates through at most SECTION_CAP papers for this section.
   try {
     const json = await _apiFetch(`/papers?${params}`);
-    if (json && Array.isArray(json.results))
-      return { data: json.results, count: json.total ?? 0, error: null };
+    if (json && Array.isArray(json.results)) {
+      const count = Math.min(json.total ?? 0, SECTION_CAP);
+      let data = json.results;
+      if (start >= SECTION_CAP) data = [];
+      else if (start + data.length > SECTION_CAP) data = data.slice(0, SECTION_CAP - start);
+      return { data, count, error: null };
+    }
   } catch (e) {
     console.warn('[railway] queryPapers failed, falling back to static JSON:', e.message);
   }
 
-  // 2. Last resort — static JSON
+  // 2. Last resort — static JSON (already capped at 1 000 by the generator)
   try {
     const res = await fetch('data/papers.json');
     const all = await res.json();
-    const start = (page - 1) * pageSize;
     const filtered = search
       ? all.filter(p => (p.title||'').toLowerCase().includes(search.toLowerCase()) ||
                         (p.abstract||'').toLowerCase().includes(search.toLowerCase()))
       : all;
-    return { data: filtered.slice(start, start + pageSize), count: filtered.length, error: null };
+    const count = Math.min(filtered.length, SECTION_CAP);
+    return { data: filtered.slice(start, Math.min(start + pageSize, SECTION_CAP)), count, error: null };
   } catch (e) {
     console.warn('[static] papers.json failed:', e.message);
   }
diff --git a/src/storage/hf_dataset.py b/src/storage/hf_dataset.py
index fa7cbf3..ddc8c73 100644
--- a/src/storage/hf_dataset.py
+++ b/src/storage/hf_dataset.py
@@ -91,6 +91,17 @@ def _to_raw(paper: dict) -> dict:
     return out
 
 
+def _bucket(row: dict) -> str:
+    """Classify a paper into arxiv / conference / journal for per-source splits."""
+    st = str(row.get("source_type") or "").lower()
+    if st == "journal":
+        return "journal"
+    if st in ("conference", "workshop"):
+        return "conference"
+    # preprint / unknown → treat as arXiv (source is arxiv/openalex preprints)
+    return "arxiv"
+
+
 def _to_instruct_rows(paper: dict) -> list[dict]:
     rows = []
     inp  = _input_text(paper)
@@ -191,7 +202,7 @@ def push(papers: list[dict] | None = None) -> bool:
 
     today = datetime.now(timezone.utc).date()
 
-    # ── Raw split ─────────────────────────────────────────────────────────────
+    # ── Raw split — combined `train` + per-source arxiv/conference/journal ──────
     raw_rows = [_to_raw(p) for p in papers if p.get("title")]
     log.info("[hf] pushing %d raw paper records …", len(raw_rows))
     _upload_with_retry(
@@ -201,6 +212,21 @@ def push(papers: list[dict] | None = None) -> bool:
         commit_message=f"update papers.jsonl ({len(raw_rows):,} papers) [{today}]",
     )
 
+    # Per-source splits so users can load just arXiv, conference, or journal
+    # papers — `load_dataset(repo, "papers", split="journal")`. The combined
+    # `train` split above stays for backward compatibility.
+    buckets: dict[str, list[dict]] = {"arxiv": [], "conference": [], "journal": []}
+    for row in raw_rows:
+        buckets[_bucket(row)].append(row)
+    for name, rows in buckets.items():
+        log.info("[hf]   %s split → %d papers", name, len(rows))
+        _upload_with_retry(
+            api,
+            path_or_fileobj=_jsonl_bytes(rows),
+            path_in_repo=f"data/papers_{name}.jsonl",
+            commit_message=f"update papers_{name}.jsonl ({len(rows):,} papers) [{today}]",
+        )
+
     # ── Instruction split ─────────────────────────────────────────────────────
     instruct_rows = []
     for p in papers:
@@ -214,7 +240,8 @@ def push(papers: list[dict] | None = None) -> bool:
     )
 
     # ── Dataset card ──────────────────────────────────────────────────────────
-    _push_card(api, len(raw_rows), len(instruct_rows))
+    _push_card(api, len(raw_rows), len(instruct_rows),
+               {k: len(v) for k, v in buckets.items()})
 
     log.info("[hf] push complete → https://huggingface.co/datasets/%s", _REPO_ID)
     return True
@@ -299,7 +326,12 @@ def _ensure_sections_config(api: Any) -> None:
     log.info("[hf] registered sections config in dataset card.")
 
 
-def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
+def _push_card(api: Any, n_papers: int, n_instruct: int,
+               by_source: dict[str, int] | None = None) -> None:
+    by_source = by_source or {}
+    n_arxiv = by_source.get("arxiv", 0)
+    n_conf  = by_source.get("conference", 0)
+    n_journal = by_source.get("journal", 0)
     card = f"""---
 license: cc-by-4.0
 language:
@@ -323,6 +355,12 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
     data_files:
       - split: train
         path: data/papers.jsonl
+      - split: arxiv
+        path: data/papers_arxiv.jsonl
+      - split: conference
+        path: data/papers_conference.jsonl
+      - split: journal
+        path: data/papers_journal.jsonl
   - config_name: instruct
     data_files:
       - split: train
@@ -341,7 +379,7 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
 
 ## Stats
 
-- **{n_papers:,}** papers (raw metadata)
+- **{n_papers:,}** papers (raw metadata) — **{n_arxiv:,}** arXiv · **{n_conf:,}** conference · **{n_journal:,}** journal
 - **{n_instruct:,}** instruction-tuning rows
 - Sources: arXiv, OpenAlex, ACL Anthology, OpenReview, PMLR, CVF, Semantic Scholar
 - Venues: NeurIPS, ICML, ICLR, ACL, EMNLP, CVPR, AAAI, IJCAI, JMLR, TMLR, TACL, TPAMI, NMI and more
@@ -350,7 +388,10 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
 
 | File | Description |
 |------|-------------|
-| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores |
+| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores (all sources combined) |
+| `data/papers_arxiv.jsonl` | arXiv / preprint papers only |
+| `data/papers_conference.jsonl` | Conference papers only (NeurIPS, ICML, ICLR, ACL, CVPR, …) |
+| `data/papers_journal.jsonl` | Journal papers only (JMLR, TPAMI, NMI, TACL, …) |
 | `data/instruct.jsonl` | Instruction-tuning pairs — summarize, key contribution, why it matters, plain English |
 | `data/sections.jsonl` | Per-section fine-tuning rows for A* papers — real body text of `abstract`, `introduction`, `related_work`, `method`, `experiments`, `results`, `conclusion`. Filter by the `section` field to train a per-section writing agent. |
 
@@ -359,9 +400,14 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
 ```python
 from datasets import load_dataset
 
-# Raw papers
+# All papers (combined)
 papers = load_dataset("kishormorol/researchscope-papers", "papers", split="train")
 
+# Just one source — arXiv, conference, or journal papers
+arxiv      = load_dataset("kishormorol/researchscope-papers", "papers", split="arxiv")
+conference = load_dataset("kishormorol/researchscope-papers", "papers", split="conference")
+journal    = load_dataset("kishormorol/researchscope-papers", "papers", split="journal")
+
 # Instruction tuning
 instruct = load_dataset("kishormorol/researchscope-papers", "instruct", split="train")
 

From 59d674b127f416da975a6ecae216c65fc25d4db3 Mon Sep 17 00:00:00 2001
From: "Md. Kishor Morol" <kishoremorol@gmail.com>
Date: Sat, 6 Jun 2026 19:08:57 -0400
Subject: [PATCH 2/2] =?UTF-8?q?fix(site):=20correct=20stale=2083K=20paper?=
 =?UTF-8?q?=20count=20=E2=86=92=20live/100K+?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The homepage hero claimed "83,000+ papers" — long stale (the corpus is now
100K+; the Railway API holds ~165K). Fixes:

- Hero count is now dynamic: loadStats() injects the live total from
  stats.json, rounded down to a clean "N,000+", so it never goes stale again
  (the static "100,000+" stays in place before JS loads, or if stats.json
  has no total_papers).
- Replace remaining hardcoded "83K+"/"83,000+" copy with "100K+"/"100,000+"
  across README and the sign-in/register pages; the index feature card now
  omits the count altogether.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 README.md             | 10 +++++-----
 site/assets/js/app.js |  6 ++++++
 site/index.html       |  4 ++--
 site/register.html    |  2 +-
 site/signin.html      |  2 +-
 5 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index b961e29..b580800 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 # ResearchScope
 
-**CS Research Intelligence Platform — 83,000+ papers, scored, ranked, and searchable.**
+**CS Research Intelligence Platform — 100,000+ papers, scored, ranked, and searchable.**
 
 Stop skimming paper lists. ResearchScope scores papers by impact, surfaces research gaps, recommends venues, and tracks who's driving the frontier — updated daily.
 
@@ -40,7 +40,7 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o
 |---|---|
 | **Jun 2026** | **OpenReview Acceptance Tiers** — oral/spotlight/poster signals captured for ICLR, NeurIPS, ICML & COLM; oral/spotlight boost paper scores and show as badges. Coverage extended through ICLR 2026, NeurIPS 2025, ICML 2025 |
 | **Jun 2026** | **Journal Recommender** — paste title + abstract to match against 20 Q1 journals (JMLR, TPAMI, Nature MI, CSUR…) with impact factor, review timeline, and open access info |
-| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (83K+ papers). User accounts synced across devices |
+| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (100K+ papers). User accounts synced across devices |
 | **Jun 2026** | **OpenAlex Integration** — 250M+ work catalogue added as a data source, covering ML/NLP/CV/IR concept groups |
 | **Jun 2026** | **HuggingFace Training Dataset** — `kishormorol/researchscope-papers` auto-pushed after every pipeline run: raw metadata JSONL + instruction-tuning pairs |
 | **Jun 2026** | **20 Q1 Journals** — JMLR, TMLR, TACL, TPAMI, IJCV, AIJ, TNNLS, Nature MI, CSUR, TIP, MLJ, TKDE, DAMI, NN, PR, CL, IPM, JACM, NatComms, TOIS |
@@ -54,11 +54,11 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o
 
 | Feature | Description |
 |---|---|
-| 📄 **83K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality |
+| 📄 **100K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality |
 | 🎓 **A* Conference coverage** | NeurIPS, ICML, ICLR, CVPR, ACL, EMNLP, AAAI, IJCAI, CHI, SIGIR, WWW, KDD and more |
 | 📖 **20 Q1 Journals** | JMLR, TMLR, TACL, TPAMI, Nature MI, and 15 more — with IF, review time, OA status |
 | 🎯 **Venue Recommenders** | Conference + Journal recommenders: paste abstract → ranked matches with expectations |
-| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 83K papers via Railway API |
+| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 100K+ papers via Railway API |
 | 👤 **User accounts** | JWT auth, favourites synced across devices via Railway backend |
 | 🕳 **Research gaps** | 3-layer extraction: explicit, pattern-detected, and starter ideas |
 | 👩‍🔬 **Author intelligence** | 5,000+ researchers ranked by momentum score |
@@ -134,7 +134,7 @@ The paper dataset is published on HuggingFace and auto-updated after every pipel
 ```python
 from datasets import load_dataset
 
-# 83K+ raw paper records (pretraining / RAG)
+# 100K+ raw paper records (pretraining / RAG)
 papers = load_dataset("kishormorol/researchscope-papers",
                       data_files="data/papers.jsonl", split="train")
 
diff --git a/site/assets/js/app.js b/site/assets/js/app.js
index 925c6d2..0380847 100644
--- a/site/assets/js/app.js
+++ b/site/assets/js/app.js
@@ -204,6 +204,12 @@ async function loadStats() {
     const el = document.getElementById(id);
     if (el) el.textContent = (val ?? 0).toLocaleString();
   }
+  // Hero tagline count — rounded down to a clean "N,000+" so it never goes stale.
+  const heroEl = document.getElementById('hero-paper-count');
+  if (heroEl && stats.total_papers) {
+    const rounded = Math.floor(stats.total_papers / 1000) * 1000;
+    heroEl.textContent = rounded.toLocaleString() + '+';
+  }
   const genEl = document.getElementById('stat-generated');
   if (genEl && stats.generated_at) {
     genEl.textContent = 'Updated ' + new Date(stats.generated_at).toLocaleDateString('en-US', { month:'short', day:'numeric', year:'numeric' });
diff --git a/site/index.html b/site/index.html
index 94950cc..895d9f2 100644
--- a/site/index.html
+++ b/site/index.html
@@ -137,7 +137,7 @@ <h1 style="font-size:clamp(2rem,6vw,3.75rem);font-weight:900;letter-spacing:-.03
     </h1>
 
     <p style="font-size:clamp(.95rem,2vw,1.15rem);opacity:.82;max-width:38rem;margin:0 auto 2.25rem;line-height:1.65">
-      83,000+ papers scored by impact · Conferences &amp; journals ranked ·
+      <span id="hero-paper-count">100,000+</span> papers scored by impact · Conferences &amp; journals ranked ·
       Research gaps surfaced · Find where to submit your next paper
     </p>
 
@@ -208,7 +208,7 @@ <h1 style="font-size:clamp(2rem,6vw,3.75rem);font-weight:900;letter-spacing:-.03
            style="background:linear-gradient(135deg,rgba(79,70,229,.12),rgba(99,102,241,.08));border:1px solid rgba(79,70,229,.15)">📄</div>
       <div>
         <h3 class="font-bold text-sm mb-1" style="letter-spacing:-.01em">Paper Browser</h3>
-        <p class="text-xs leading-relaxed" style="color:var(--rs-muted)">83K+ papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.</p>
+        <p class="text-xs leading-relaxed" style="color:var(--rs-muted)">Papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.</p>
       </div>
     </a>
 
diff --git a/site/register.html b/site/register.html
index 47e7361..15e927f 100644
--- a/site/register.html
+++ b/site/register.html
@@ -123,7 +123,7 @@ <h2 style="font-size:1.15rem;font-weight:800;margin:0 0 .25rem;letter-spacing:-.
           <span style="font-size:.95rem;flex-shrink:0">⭐</span>Save &amp; sync favourite papers
         </li>
         <li style="font-size:.82rem;opacity:.9;display:flex;align-items:center;gap:.6rem;line-height:1.4">
-          <span style="font-size:.95rem;flex-shrink:0">🔍</span>Full-text search across 83K+ papers
+          <span style="font-size:.95rem;flex-shrink:0">🔍</span>Full-text search across 100K+ papers
         </li>
         <li style="font-size:.82rem;opacity:.9;display:flex;align-items:center;gap:.6rem;line-height:1.4">
           <span style="font-size:.95rem;flex-shrink:0">🎓</span>Conference &amp; journal recommender
diff --git a/site/signin.html b/site/signin.html
index 63b0c42..e62c25f 100644
--- a/site/signin.html
+++ b/site/signin.html
@@ -122,7 +122,7 @@ <h2 style="font-size:1.15rem;font-weight:800;margin:0 0 .25rem;letter-spacing:-.
           <span style="font-size:.95rem;flex-shrink:0">⭐</span>Save &amp; sync favourite papers
         </li>
         <li style="font-size:.82rem;opacity:.9;display:flex;align-items:center;gap:.6rem;line-height:1.4">
-          <span style="font-size:.95rem;flex-shrink:0">🔍</span>Full-text search across 83K+ papers
+          <span style="font-size:.95rem;flex-shrink:0">🔍</span>Full-text search across 100K+ papers
         </li>
         <li style="font-size:.82rem;opacity:.9;display:flex;align-items:center;gap:.6rem;line-height:1.4">
           <span style="font-size:.95rem;flex-shrink:0">🎓</span>Conference &amp; journal recommender