Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions .github/workflows/journal-sync.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
name: Journal Sync

env:
  # Quoted so the runner always receives the literal string "true".
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: "true"

# Fetch ONLY journal papers (OpenAlex by source id, +S2 supplement) and persist
# them. Conference/arXiv rows are preserved — the Railway sync is an upsert and
# the pipeline reloads existing DBs before writing. Manual trigger only; the
# monthly conference-sync already refreshes journals as part of its run.

on:
  workflow_dispatch:

# Deny by default; each job below opts in to exactly the token scopes it needs,
# so the data-sync job (which installs and runs third-party packages) never
# holds the Pages / OIDC scopes that only the deploy job requires.
permissions: {}

concurrency:
  # NOTE(review): the group name suggests it is shared with other workflows
  # that push to main — confirm. Runs already in progress are never cancelled.
  group: push-to-main
  cancel-in-progress: false

jobs:
  journal-sync:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # needed for `git push origin main` below
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          cache: 'pip'
          cache-dependency-path: requirements.txt

      - run: pip install -r requirements.txt

      - name: Sync journal papers (OpenAlex by source id + S2 supplement)
        env:
          SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          RAILWAY_DATABASE_URL: ${{ secrets.RAILWAY_DATABASE_URL }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          echo "Running: python src/pipeline.py --journals-only"
          echo "Journals: JMLR, TMLR, TPAMI, IJCV, AIJ, TNNLS, NMI, CSUR, TIP, MLJ,"
          echo " TKDE, DAMI, NN, PR, CL, IPM, JACM, NatComms, TOIS,"
          echo " JAIR, TASLP, VLDBJ, TKDD, JAAMAS, TOG"
          python src/pipeline.py --journals-only

      # Informational only: prints a per-venue count of the persisted journal DB.
      - name: Report results
        run: |
          python -c "
          import json, os
          path = 'site/data/journals_db.json'
          if os.path.exists(path):
              with open(path) as fh:
                  db = json.load(fh)
              venues = {}
              for p in db:
                  v = p.get('venue', 'unknown')
                  venues[v] = venues.get(v, 0) + 1
              print(f'Journal papers in journals_db.json: {len(db)}')
              for v, n in sorted(venues.items(), key=lambda x: -x[1]):
                  print(f' {v}: {n}')
          else:
              print('journals_db.json not found')
          "

      - name: Build journal recommender data
        run: |
          python src/sitegen/journal_recommender.py

      - name: Commit updated data
        run: |
          set -euo pipefail
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add site/data/
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: journal sync [$(date -u +%Y-%m-%d)]"
            pushed=false
            for i in 1 2 3; do
              if git push origin main; then
                pushed=true
                break
              fi
              echo "Push attempt $i failed, retrying with rebase (theirs)…"
              # During a rebase, "theirs" is the commit being replayed, i.e.
              # this run's data commit wins over conflicting upstream changes.
              git pull --rebase --autostash -X theirs origin main
              sleep 5
            done
            # Without this check the loop would fall through after the last
            # failed attempt, the step would exit 0, and the deploy job would
            # publish main without the freshly synced data.
            if [ "$pushed" != "true" ]; then
              echo "::error::Could not push journal sync commit to main after 3 attempts"
              exit 1
            fi
          fi

  deploy:
    needs: journal-sync
    runs-on: ubuntu-latest
    permissions:
      contents: read   # checkout
      pages: write     # publish the Pages deployment
      id-token: write  # OIDC token used by actions/deploy-pages
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      # Check out main explicitly so the commit pushed by journal-sync is
      # included in the published site.
      - uses: actions/checkout@v4
        with:
          ref: main
      - uses: actions/configure-pages@v4
      - uses: actions/upload-pages-artifact@v3
        with:
          path: site/
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
87 changes: 53 additions & 34 deletions src/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,35 @@ def _load_journal_papers() -> list[Paper]:

# ── Pipeline ──────────────────────────────────────────────────────────────────

def _fetch_journal_papers() -> list[Paper]:
    """Gather journal papers from OpenAlex, optionally topped up from S2.

    OpenAlex (keyless, queried systematically by source id) is tried first.
    The Semantic Scholar journal search is treated as unreliable here — it
    filters by venue *short name* and query text, which yields 0 results and
    chronic HTTP 429/400s — so it is consulted only when
    ``SEMANTIC_SCHOLAR_API_KEY`` is set, as a supplement for venues OpenAlex
    under-indexes (e.g. JMLR/TMLR).

    Both fetches are best-effort: any exception is logged as a warning and the
    function carries on with whatever it has collected so far.

    Returns:
        The OpenAlex results followed by any S2 results. Overlap is NOT removed
        here; deduplication happens later in the pipeline.
    """
    collected: list[Paper] = []

    # Primary source: OpenAlex.
    log.info(" [openalex] bulk-fetching top CS journals by source id …")
    try:
        collected = OpenAlexConnector().fetch_journals()
        log.info(" → %d journal papers (openalex)", len(collected))
    except Exception as exc:
        log.warning(" [openalex] journal fetch failed: %s", exc)

    # No S2 key configured → nothing to supplement with.
    if not os.getenv("SEMANTIC_SCHOLAR_API_KEY"):
        return collected

    # Optional supplement: Semantic Scholar.
    log.info(" [s2] supplementing journals (JMLR/TMLR coverage) …")
    try:
        extra = SemanticScholarConnector().fetch_journals()
        log.info(" → %d journal papers (s2)", len(extra))
        collected.extend(extra)
    except Exception as exc:
        log.warning(" [s2] journal supplement failed: %s", exc)

    return collected


def run_pipeline(
queries: list[str] | None = None,
max_results_per_query: int = 50,
Expand All @@ -220,6 +249,7 @@ def run_pipeline(
today_max: int = 2000,
skip_conferences: bool = False,
conferences_only: bool = False,
journals_only: bool = False,
accumulate: bool = True,
max_age_days: int = 180,
backfill_from: str | None = None,
Expand All @@ -234,11 +264,16 @@ def run_pipeline(
arxiv = ArxivConnector()
all_papers: list[Paper] = []

# ── Journals-only mode: fetch journal papers and skip every other source ──
if journals_only:
log.info(" journals-only mode: fetching journal papers only")
all_papers.extend(_fetch_journal_papers())

# ── arXiv + ACL (skipped in conferences-only mode) ────────────────────────
if conferences_only:
log.info(" conferences-only mode: skipping arXiv and ACL")

if backfill_from and not conferences_only:
if backfill_from and not conferences_only and not journals_only:
# ── Backfill mode: sweep entire date range from given date to today ──
try:
from_date = date.fromisoformat(backfill_from)
Expand All @@ -258,7 +293,7 @@ def run_pipeline(
log.error(" [arxiv] fetch_range failed: %s", exc)
return {}

elif today_mode and not conferences_only:
elif today_mode and not conferences_only and not journals_only:
log.info(" [arxiv] today-mode: fetching all CS papers from last 2 days …")
try:
fetched = arxiv.fetch_today(max_results=today_max)
Expand All @@ -272,7 +307,7 @@ def run_pipeline(
log.warning(" [arxiv] fetch_today failed: %s — falling back to queries", exc)
today_mode = False # fall through to keyword queries

if not today_mode and not backfill_from and not conferences_only:
if not today_mode and not backfill_from and not conferences_only and not journals_only:
for query in queries:
log.info(" [arxiv] '%s' …", query)
try:
Expand All @@ -283,7 +318,7 @@ def run_pipeline(
log.info(" → %d papers", len(fetched))
all_papers.extend(fetched)

if not skip_acl and not conferences_only:
if not skip_acl and not conferences_only and not journals_only:
acl = ACLAnthologyConnector()
for query in queries:
log.info(" [acl] '%s' …", query)
Expand All @@ -295,7 +330,7 @@ def run_pipeline(
log.info(" → %d papers", len(fetched))
all_papers.extend(fetched)

if not skip_conferences or conferences_only:
if (not skip_conferences or conferences_only) and not journals_only:
if conferences_only:
# ── Conference-sync mode: fetch ALL papers directly from proceedings ──
# OpenReview — ICLR, NeurIPS, COLM (authenticates via env credentials)
Expand Down Expand Up @@ -341,31 +376,7 @@ def run_pipeline(
except Exception as exc:
log.warning(" [s2] bulk fetch_all failed: %s", exc)

# Bulk fetch — top CS journals via OpenAlex (keyless, systematic by
# source id). OpenAlex is the primary source because the S2 journal
# search is unreliable here — it filters by venue *short name* and
# query text, which yields 0 results and chronic HTTP 429/400s.
log.info(" [openalex] bulk-fetching top CS journals by source id …")
journal_papers: list[Paper] = []
try:
journal_papers = OpenAlexConnector().fetch_journals()
log.info(" → %d journal papers (openalex)", len(journal_papers))
except Exception as exc:
log.warning(" [openalex] journal fetch failed: %s", exc)

# Supplement with S2 only when a key is configured — it occasionally
# adds coverage for venues OpenAlex under-indexes (e.g. JMLR/TMLR).
# Dedup later in the pipeline removes overlaps; failures are non-fatal.
if os.getenv("SEMANTIC_SCHOLAR_API_KEY"):
log.info(" [s2] supplementing journals (JMLR/TMLR coverage) …")
try:
s2_journals = SemanticScholarConnector().fetch_journals()
log.info(" → %d journal papers (s2)", len(s2_journals))
journal_papers.extend(s2_journals)
except Exception as exc:
log.warning(" [s2] journal supplement failed: %s", exc)

all_papers.extend(journal_papers)
all_papers.extend(_fetch_journal_papers())

else:
# ── Keyword-query mode (used in daily pipeline if skip_conferences=False) ──
Expand All @@ -385,7 +396,7 @@ def run_pipeline(
all_papers.extend(fetched)

# ── OpenAlex (always, unless skip_conferences) ────────────────────────────
if not skip_conferences:
if not skip_conferences and not journals_only:
if conferences_only:
log.info(" [openalex] bulk-fetching ML/NLP/CV/IR papers …")
try:
Expand Down Expand Up @@ -416,9 +427,11 @@ def run_pipeline(

# ── Accumulate existing papers ────────────────────────────────────────────
if accumulate:
if conferences_only:
# Conference sync: accumulate existing conference + journal papers (no expiry)
# and also bring in arXiv papers so the site output stays complete.
if conferences_only or journals_only:
# Conference / journal sync: accumulate existing conference + journal
# papers (no expiry) and also bring in arXiv papers so the site output
# stays complete. In journals-only mode the freshly fetched journals
# merge with these; conference/arXiv rows are preserved, not dropped.
existing_conf = _load_conference_papers()
existing_journals = _load_journal_papers()
existing_arxiv = _load_arxiv_papers(max_age_days=max_age_days)
Expand Down Expand Up @@ -566,6 +579,11 @@ def _parse_args() -> argparse.Namespace:
"--conferences-only", action="store_true",
help="Fetch ONLY from conference sources (S2 + OpenReview). Skip arXiv and ACL.",
)
parser.add_argument(
"--journals-only", action="store_true",
help="Fetch ONLY journal papers (OpenAlex by source id, +S2 supplement). "
"Existing conference/arXiv papers are preserved (Railway upsert).",
)
parser.add_argument(
"--backfill-from", metavar="YYYY-MM-DD",
help="Fetch ALL arXiv CS papers from this date to today (e.g. 2026-01-01)",
Expand All @@ -592,6 +610,7 @@ def _parse_args() -> argparse.Namespace:
today_max=args.today_max,
skip_conferences=args.skip_conferences,
conferences_only=args.conferences_only,
journals_only=args.journals_only,
accumulate=not args.fresh_start,
max_age_days=args.max_age_days,
backfill_from=args.backfill_from,
Expand Down
Loading