From efa313addcd2864e9c045262f2e2f080497e3b62 Mon Sep 17 00:00:00 2001
From: "Md. Kishor Morol"
Date: Sat, 6 Jun 2026 19:04:59 -0400
Subject: [PATCH 1/2] feat: cap site to 1000/section + split HF dataset by
source
Website:
- Cap each browse section (arXiv / conference / journal) to 1,000 papers
(3,000 total). _queryPapers now clamps the reported count to SECTION_CAP
and trims rows past the cap, so pagination stops at 1,000 per section.
The full corpus stays available via the API and the HF dataset; the
static JSON was already capped at 1,000.
HF dataset:
- Split the `papers` config by source into `arxiv`, `conference`, and
`journal` splits (uploaded as `data/papers_{source}.jsonl`), alongside the
existing combined `train` split (kept for backward compatibility).
Users can now `load_dataset(repo, "papers", split="journal")`.
- Card updated: per-source counts in stats, file table, and usage examples.
Co-Authored-By: Claude Opus 4.8
---
site/assets/js/railway-api.js | 25 +++++++++++----
src/storage/hf_dataset.py | 58 +++++++++++++++++++++++++++++++----
2 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/site/assets/js/railway-api.js b/site/assets/js/railway-api.js
index 9a70a68..7bb7a6e 100644
--- a/site/assets/js/railway-api.js
+++ b/site/assets/js/railway-api.js
@@ -11,6 +11,11 @@
const RS_API = 'https://researchscope-production.up.railway.app';
+// Public site shows at most this many papers per section (arXiv / conference /
+// journal) — 3 000 total. The full corpus stays available via the API and the
+// Hugging Face dataset; this just bounds what the browse pages paginate through.
+const SECTION_CAP = 1000;
+
// ── Auth state ────────────────────────────────────────────────────────────────
const _auth = {
@@ -67,25 +72,33 @@ async function _queryPapers({
else if (source === 'conference') params.set('source_type', 'conference');
else if (source === 'journal') params.set('source_type', 'journal');
- // 1. Try Railway
+ const start = (page - 1) * pageSize;
+
+ // 1. Try Railway — clamp the reported count and trim rows past the cap so the
+ // browse page paginates through at most SECTION_CAP papers for this section.
try {
const json = await _apiFetch(`/papers?${params}`);
- if (json && Array.isArray(json.results))
- return { data: json.results, count: json.total ?? 0, error: null };
+ if (json && Array.isArray(json.results)) {
+ const count = Math.min(json.total ?? 0, SECTION_CAP);
+ let data = json.results;
+ if (start >= SECTION_CAP) data = [];
+ else if (start + data.length > SECTION_CAP) data = data.slice(0, SECTION_CAP - start);
+ return { data, count, error: null };
+ }
} catch (e) {
console.warn('[railway] queryPapers failed, falling back to static JSON:', e.message);
}
- // 2. Last resort — static JSON
+ // 2. Last resort — static JSON (already capped at 1 000 by the generator)
try {
const res = await fetch('data/papers.json');
const all = await res.json();
- const start = (page - 1) * pageSize;
const filtered = search
? all.filter(p => (p.title||'').toLowerCase().includes(search.toLowerCase()) ||
(p.abstract||'').toLowerCase().includes(search.toLowerCase()))
: all;
- return { data: filtered.slice(start, start + pageSize), count: filtered.length, error: null };
+ const count = Math.min(filtered.length, SECTION_CAP);
+ return { data: filtered.slice(start, Math.min(start + pageSize, SECTION_CAP)), count, error: null };
} catch (e) {
console.warn('[static] papers.json failed:', e.message);
}
diff --git a/src/storage/hf_dataset.py b/src/storage/hf_dataset.py
index fa7cbf3..ddc8c73 100644
--- a/src/storage/hf_dataset.py
+++ b/src/storage/hf_dataset.py
@@ -91,6 +91,17 @@ def _to_raw(paper: dict) -> dict:
return out
+def _bucket(row: dict) -> str:
+ """Classify a paper into arxiv / conference / journal for per-source splits."""
+ st = str(row.get("source_type") or "").lower()
+ if st == "journal":
+ return "journal"
+ if st in ("conference", "workshop"):
+ return "conference"
+ # preprint / unknown → treat as arXiv (source is arxiv/openalex preprints)
+ return "arxiv"
+
+
def _to_instruct_rows(paper: dict) -> list[dict]:
rows = []
inp = _input_text(paper)
@@ -191,7 +202,7 @@ def push(papers: list[dict] | None = None) -> bool:
today = datetime.now(timezone.utc).date()
- # ── Raw split ─────────────────────────────────────────────────────────────
+ # ── Raw split — combined `train` + per-source arxiv/conference/journal ──────
raw_rows = [_to_raw(p) for p in papers if p.get("title")]
log.info("[hf] pushing %d raw paper records …", len(raw_rows))
_upload_with_retry(
@@ -201,6 +212,21 @@ def push(papers: list[dict] | None = None) -> bool:
commit_message=f"update papers.jsonl ({len(raw_rows):,} papers) [{today}]",
)
+ # Per-source splits so users can load just arXiv, conference, or journal
+ # papers — `load_dataset(repo, "papers", split="journal")`. The combined
+ # `train` split above stays for backward compatibility.
+ buckets: dict[str, list[dict]] = {"arxiv": [], "conference": [], "journal": []}
+ for row in raw_rows:
+ buckets[_bucket(row)].append(row)
+ for name, rows in buckets.items():
+ log.info("[hf] %s split → %d papers", name, len(rows))
+ _upload_with_retry(
+ api,
+ path_or_fileobj=_jsonl_bytes(rows),
+ path_in_repo=f"data/papers_{name}.jsonl",
+ commit_message=f"update papers_{name}.jsonl ({len(rows):,} papers) [{today}]",
+ )
+
# ── Instruction split ─────────────────────────────────────────────────────
instruct_rows = []
for p in papers:
@@ -214,7 +240,8 @@ def push(papers: list[dict] | None = None) -> bool:
)
# ── Dataset card ──────────────────────────────────────────────────────────
- _push_card(api, len(raw_rows), len(instruct_rows))
+ _push_card(api, len(raw_rows), len(instruct_rows),
+ {k: len(v) for k, v in buckets.items()})
log.info("[hf] push complete → https://huggingface.co/datasets/%s", _REPO_ID)
return True
@@ -299,7 +326,12 @@ def _ensure_sections_config(api: Any) -> None:
log.info("[hf] registered sections config in dataset card.")
-def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
+def _push_card(api: Any, n_papers: int, n_instruct: int,
+ by_source: dict[str, int] | None = None) -> None:
+ by_source = by_source or {}
+ n_arxiv = by_source.get("arxiv", 0)
+ n_conf = by_source.get("conference", 0)
+ n_journal = by_source.get("journal", 0)
card = f"""---
license: cc-by-4.0
language:
@@ -323,6 +355,12 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
data_files:
- split: train
path: data/papers.jsonl
+ - split: arxiv
+ path: data/papers_arxiv.jsonl
+ - split: conference
+ path: data/papers_conference.jsonl
+ - split: journal
+ path: data/papers_journal.jsonl
- config_name: instruct
data_files:
- split: train
@@ -341,7 +379,7 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
## Stats
-- **{n_papers:,}** papers (raw metadata)
+- **{n_papers:,}** papers (raw metadata) — **{n_arxiv:,}** arXiv · **{n_conf:,}** conference · **{n_journal:,}** journal
- **{n_instruct:,}** instruction-tuning rows
- Sources: arXiv, OpenAlex, ACL Anthology, OpenReview, PMLR, CVF, Semantic Scholar
- Venues: NeurIPS, ICML, ICLR, ACL, EMNLP, CVPR, AAAI, IJCAI, JMLR, TMLR, TACL, TPAMI, NMI and more
@@ -350,7 +388,10 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
| File | Description |
|------|-------------|
-| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores |
+| `data/papers.jsonl` | Raw paper metadata — title, abstract, authors, venue, year, tags, scores (all sources combined) |
+| `data/papers_arxiv.jsonl` | arXiv / preprint papers only |
+| `data/papers_conference.jsonl` | Conference papers only (NeurIPS, ICML, ICLR, ACL, CVPR, …) |
+| `data/papers_journal.jsonl` | Journal papers only (JMLR, TPAMI, NMI, TACL, …) |
| `data/instruct.jsonl` | Instruction-tuning pairs — summarize, key contribution, why it matters, plain English |
| `data/sections.jsonl` | Per-section fine-tuning rows for A* papers — real body text of `abstract`, `introduction`, `related_work`, `method`, `experiments`, `results`, `conclusion`. Filter by the `section` field to train a per-section writing agent. |
@@ -359,9 +400,14 @@ def _push_card(api: Any, n_papers: int, n_instruct: int) -> None:
```python
from datasets import load_dataset
-# Raw papers
+# All papers (combined)
papers = load_dataset("kishormorol/researchscope-papers", "papers", split="train")
+# Just one source — arXiv, conference, or journal papers
+arxiv = load_dataset("kishormorol/researchscope-papers", "papers", split="arxiv")
+conference = load_dataset("kishormorol/researchscope-papers", "papers", split="conference")
+journal = load_dataset("kishormorol/researchscope-papers", "papers", split="journal")
+
# Instruction tuning
instruct = load_dataset("kishormorol/researchscope-papers", "instruct", split="train")
From 59d674b127f416da975a6ecae216c65fc25d4db3 Mon Sep 17 00:00:00 2001
From: "Md. Kishor Morol"
Date: Sat, 6 Jun 2026 19:08:57 -0400
Subject: [PATCH 2/2] =?UTF-8?q?fix(site):=20correct=20stale=2083K=20paper?=
=?UTF-8?q?=20count=20=E2=86=92=20live/100K+?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The homepage hero claimed "83,000+ papers" — long stale (the corpus is now
100K+; the Railway API holds ~165K). Fixes:
- Hero count is now dynamic: loadStats() injects the live total from
stats.json, rounded down to a clean "N,000+", so it never goes stale again
(static fallback "100,000+" before JS loads).
- Replace remaining hardcoded "83K+"/"83,000+" copy with "100K+"/"100,000+" in
the README and the sign-in/register pages; the index feature card now omits
the count instead of hardcoding one.
Co-Authored-By: Claude Opus 4.8
---
README.md | 10 +++++-----
site/assets/js/app.js | 6 ++++++
site/index.html | 4 ++--
site/register.html | 2 +-
site/signin.html | 2 +-
5 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/README.md b/README.md
index b961e29..b580800 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
# ResearchScope
-**CS Research Intelligence Platform — 83,000+ papers, scored, ranked, and searchable.**
+**CS Research Intelligence Platform — 100,000+ papers, scored, ranked, and searchable.**
Stop skimming paper lists. ResearchScope scores papers by impact, surfaces research gaps, recommends venues, and tracks who's driving the frontier — updated daily.
@@ -40,7 +40,7 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o
|---|---|
| **Jun 2026** | **OpenReview Acceptance Tiers** — oral/spotlight/poster signals captured for ICLR, NeurIPS, ICML & COLM; oral/spotlight boost paper scores and show as badges. Coverage extended through ICLR 2026, NeurIPS 2025, ICML 2025 |
| **Jun 2026** | **Journal Recommender** — paste title + abstract to match against 20 Q1 journals (JMLR, TPAMI, Nature MI, CSUR…) with impact factor, review timeline, and open access info |
-| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (83K+ papers). User accounts synced across devices |
+| **Jun 2026** | **FastAPI Backend on Railway** — full REST API with JWT auth, favourites, PostgreSQL full-text search (100K+ papers). User accounts synced across devices |
| **Jun 2026** | **OpenAlex Integration** — 250M+ work catalogue added as a data source, covering ML/NLP/CV/IR concept groups |
| **Jun 2026** | **HuggingFace Training Dataset** — `kishormorol/researchscope-papers` auto-pushed after every pipeline run: raw metadata JSONL + instruction-tuning pairs |
| **Jun 2026** | **20 Q1 Journals** — JMLR, TMLR, TACL, TPAMI, IJCV, AIJ, TNNLS, Nature MI, CSUR, TIP, MLJ, TKDE, DAMI, NN, PR, CL, IPM, JACM, NatComms, TOIS |
@@ -54,11 +54,11 @@ The frontend is a static site on GitHub Pages backed by a **FastAPI REST API** o
| Feature | Description |
|---|---|
-| 📄 **83K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality |
+| 📄 **100K+ papers** | Scored by recency, venue rank, acceptance tier (oral/spotlight), novelty, author prestige, and citation quality |
| 🎓 **A* Conference coverage** | NeurIPS, ICML, ICLR, CVPR, ACL, EMNLP, AAAI, IJCAI, CHI, SIGIR, WWW, KDD and more |
| 📖 **20 Q1 Journals** | JMLR, TMLR, TACL, TPAMI, Nature MI, and 15 more — with IF, review time, OA status |
| 🎯 **Venue Recommenders** | Conference + Journal recommenders: paste abstract → ranked matches with expectations |
-| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 83K papers via Railway API |
+| 🔍 **Full-text search** | PostgreSQL `tsvector` search across 100K+ papers via Railway API |
| 👤 **User accounts** | JWT auth, favourites synced across devices via Railway backend |
| 🕳 **Research gaps** | 3-layer extraction: explicit, pattern-detected, and starter ideas |
| 👩🔬 **Author intelligence** | 5,000+ researchers ranked by momentum score |
@@ -134,7 +134,7 @@ The paper dataset is published on HuggingFace and auto-updated after every pipel
```python
from datasets import load_dataset
-# 83K+ raw paper records (pretraining / RAG)
+# 100K+ raw paper records (pretraining / RAG)
papers = load_dataset("kishormorol/researchscope-papers",
data_files="data/papers.jsonl", split="train")
diff --git a/site/assets/js/app.js b/site/assets/js/app.js
index 925c6d2..0380847 100644
--- a/site/assets/js/app.js
+++ b/site/assets/js/app.js
@@ -204,6 +204,12 @@ async function loadStats() {
const el = document.getElementById(id);
if (el) el.textContent = (val ?? 0).toLocaleString();
}
+ // Hero tagline count — rounded down to a clean "N,000+" so it never goes stale.
+ const heroEl = document.getElementById('hero-paper-count');
+ if (heroEl && stats.total_papers) {
+ const rounded = Math.floor(stats.total_papers / 1000) * 1000;
+ heroEl.textContent = rounded.toLocaleString() + '+';
+ }
const genEl = document.getElementById('stat-generated');
if (genEl && stats.generated_at) {
genEl.textContent = 'Updated ' + new Date(stats.generated_at).toLocaleDateString('en-US', { month:'short', day:'numeric', year:'numeric' });
diff --git a/site/index.html b/site/index.html
index 94950cc..895d9f2 100644
--- a/site/index.html
+++ b/site/index.html
@@ -137,7 +137,7 @@
- 83,000+ papers scored by impact · Conferences & journals ranked ·
+ 100,000+ papers scored by impact · Conferences & journals ranked ·
Research gaps surfaced · Find where to submit your next paper
@@ -208,7 +208,7 @@
📄
Paper Browser
-
83K+ papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.
+
Papers scored by impact, novelty, and venue rank. Filter by topic, year, difficulty, or source.