diff --git a/.github/workflows/journal-sync.yml b/.github/workflows/journal-sync.yml
new file mode 100644
index 0000000..eae1fdf
--- /dev/null
+++ b/.github/workflows/journal-sync.yml
@@ -0,0 +1,119 @@
+name: Journal Sync
+
+env:
+  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
+
+# Fetch ONLY journal papers (OpenAlex by source id, +S2 supplement) and persist
+# them. Conference/arXiv rows are preserved — the Railway sync is an upsert and
+# the pipeline reloads existing DBs before writing. Manual trigger only; the
+# monthly conference-sync already refreshes journals as part of its run.
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: write
+  pages: write
+  id-token: write
+
+concurrency:
+  group: push-to-main
+  cancel-in-progress: false
+
+jobs:
+  journal-sync:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: requirements.txt
+
+      - run: pip install -r requirements.txt
+
+      - name: Sync journal papers (OpenAlex by source id + S2 supplement)
+        env:
+          SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          RAILWAY_DATABASE_URL: ${{ secrets.RAILWAY_DATABASE_URL }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          echo "Running: python src/pipeline.py --journals-only"
+          echo "Journals: JMLR, TMLR, TPAMI, IJCV, AIJ, TNNLS, NMI, CSUR, TIP, MLJ,"
+          echo " TKDE, DAMI, NN, PR, CL, IPM, JACM, NatComms, TOIS,"
+          echo " JAIR, TASLP, VLDBJ, TKDD, JAAMAS, TOG"
+          python src/pipeline.py --journals-only
+
+      - name: Report results
+        run: |
+          python -c "
+          import json, os
+          path = 'site/data/journals_db.json'
+          if os.path.exists(path):
+              with open(path, encoding='utf-8') as fh:
+                  db = json.load(fh)
+              venues = {}
+              for p in db:
+                  v = p.get('venue', 'unknown')
+                  venues[v] = venues.get(v, 0) + 1
+              print(f'Journal papers in journals_db.json: {len(db)}')
+              for v, n in sorted(venues.items(), key=lambda x: -x[1]):
+                  print(f' {v}: {n}')
+          else:
+              print('journals_db.json not found')
+          "
+
+      - name: Build journal recommender data
+        run: |
+          python src/sitegen/journal_recommender.py
+
+      - name: Commit updated data
+        run: |
+          set -euo pipefail
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add site/data/
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "chore: journal sync [$(date -u +%Y-%m-%d)]"
+            # Track success explicitly: a failed push inside the loop does not
+            # trip `set -e`, so without this flag three failed pushes would end
+            # the step with status 0 and the deploy job would publish stale data.
+            pushed=false
+            for i in 1 2 3; do
+              if git push origin main; then
+                pushed=true
+                break
+              fi
+              echo "Push attempt $i failed, retrying with rebase (theirs)…"
+              git pull --rebase --autostash -X theirs origin main
+              sleep 5
+            done
+            if [ "$pushed" != true ]; then
+              echo "Push failed after 3 attempts" >&2
+              exit 1
+            fi
+          fi
+
+  deploy:
+    needs: journal-sync
+    runs-on: ubuntu-latest
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: main
+      - uses: actions/configure-pages@v4
+      - uses: actions/upload-pages-artifact@v3
+        with:
+          path: site/
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
diff --git a/src/pipeline.py b/src/pipeline.py
index a618af4..019e232 100644
--- a/src/pipeline.py
+++ b/src/pipeline.py
@@ -211,6 +211,35 @@ def _load_journal_papers() -> list[Paper]:
 
 # ── Pipeline ──────────────────────────────────────────────────────────────────
 
+def _fetch_journal_papers() -> list[Paper]:
+    """Bulk-fetch top CS journals via OpenAlex (keyless, systematic by source id).
+
+    OpenAlex is the primary source because the S2 journal search is unreliable
+    here — it filters by venue *short name* and query text, which yields 0
+    results and chronic HTTP 429/400s. S2 is only a non-fatal supplement when a
+    key is configured, for venues OpenAlex under-indexes (e.g. JMLR/TMLR).
+    Dedup later in the pipeline removes any overlap.
+    """
+    log.info(" [openalex] bulk-fetching top CS journals by source id …")
+    journal_papers: list[Paper] = []
+    try:
+        journal_papers = OpenAlexConnector().fetch_journals()
+        log.info(" → %d journal papers (openalex)", len(journal_papers))
+    except Exception as exc:
+        log.warning(" [openalex] journal fetch failed: %s", exc)
+
+    if os.getenv("SEMANTIC_SCHOLAR_API_KEY"):
+        log.info(" [s2] supplementing journals (JMLR/TMLR coverage) …")
+        try:
+            s2_journals = SemanticScholarConnector().fetch_journals()
+            log.info(" → %d journal papers (s2)", len(s2_journals))
+            journal_papers.extend(s2_journals)
+        except Exception as exc:
+            log.warning(" [s2] journal supplement failed: %s", exc)
+
+    return journal_papers
+
+
 def run_pipeline(
     queries: list[str] | None = None,
     max_results_per_query: int = 50,
@@ -220,6 +249,7 @@ def run_pipeline(
     today_max: int = 2000,
     skip_conferences: bool = False,
     conferences_only: bool = False,
+    journals_only: bool = False,
     accumulate: bool = True,
     max_age_days: int = 180,
     backfill_from: str | None = None,
@@ -234,11 +264,16 @@ def run_pipeline(
     arxiv = ArxivConnector()
     all_papers: list[Paper] = []
 
+    # ── Journals-only mode: fetch journal papers and skip every other source ──
+    if journals_only:
+        log.info(" journals-only mode: fetching journal papers only")
+        all_papers.extend(_fetch_journal_papers())
+
     # ── arXiv + ACL (skipped in conferences-only mode) ────────────────────────
     if conferences_only:
         log.info(" conferences-only mode: skipping arXiv and ACL")
 
-    if backfill_from and not conferences_only:
+    if backfill_from and not conferences_only and not journals_only:
         # ── Backfill mode: sweep entire date range from given date to today ──
         try:
             from_date = date.fromisoformat(backfill_from)
@@ -258,7 +293,7 @@ def run_pipeline(
             log.error(" [arxiv] fetch_range failed: %s", exc)
             return {}
 
-    elif today_mode and not conferences_only:
+    elif today_mode and not conferences_only and not journals_only:
         log.info(" [arxiv] today-mode: fetching all CS papers from last 2 days …")
         try:
             fetched = arxiv.fetch_today(max_results=today_max)
@@ -272,7 +307,7 @@ def run_pipeline(
             log.warning(" [arxiv] fetch_today failed: %s — falling back to queries", exc)
             today_mode = False  # fall through to keyword queries
 
-    if not today_mode and not backfill_from and not conferences_only:
+    if not today_mode and not backfill_from and not conferences_only and not journals_only:
         for query in queries:
             log.info(" [arxiv] '%s' …", query)
             try:
@@ -283,7 +318,7 @@ def run_pipeline(
             log.info(" → %d papers", len(fetched))
             all_papers.extend(fetched)
 
-    if not skip_acl and not conferences_only:
+    if not skip_acl and not conferences_only and not journals_only:
         acl = ACLAnthologyConnector()
         for query in queries:
             log.info(" [acl] '%s' …", query)
@@ -295,7 +330,7 @@ def run_pipeline(
             log.info(" → %d papers", len(fetched))
             all_papers.extend(fetched)
 
-    if not skip_conferences or conferences_only:
+    if (not skip_conferences or conferences_only) and not journals_only:
         if conferences_only:
             # ── Conference-sync mode: fetch ALL papers directly from proceedings ──
             # OpenReview — ICLR, NeurIPS, COLM (authenticates via env credentials)
@@ -341,31 +376,7 @@ def run_pipeline(
             except Exception as exc:
                 log.warning(" [s2] bulk fetch_all failed: %s", exc)
 
-            # Bulk fetch — top CS journals via OpenAlex (keyless, systematic by
-            # source id). OpenAlex is the primary source because the S2 journal
-            # search is unreliable here — it filters by venue *short name* and
-            # query text, which yields 0 results and chronic HTTP 429/400s.
-            log.info(" [openalex] bulk-fetching top CS journals by source id …")
-            journal_papers: list[Paper] = []
-            try:
-                journal_papers = OpenAlexConnector().fetch_journals()
-                log.info(" → %d journal papers (openalex)", len(journal_papers))
-            except Exception as exc:
-                log.warning(" [openalex] journal fetch failed: %s", exc)
-
-            # Supplement with S2 only when a key is configured — it occasionally
-            # adds coverage for venues OpenAlex under-indexes (e.g. JMLR/TMLR).
-            # Dedup later in the pipeline removes overlaps; failures are non-fatal.
-            if os.getenv("SEMANTIC_SCHOLAR_API_KEY"):
-                log.info(" [s2] supplementing journals (JMLR/TMLR coverage) …")
-                try:
-                    s2_journals = SemanticScholarConnector().fetch_journals()
-                    log.info(" → %d journal papers (s2)", len(s2_journals))
-                    journal_papers.extend(s2_journals)
-                except Exception as exc:
-                    log.warning(" [s2] journal supplement failed: %s", exc)
-
-            all_papers.extend(journal_papers)
+            all_papers.extend(_fetch_journal_papers())
 
         else:
             # ── Keyword-query mode (used in daily pipeline if skip_conferences=False) ──
@@ -385,7 +396,7 @@ def run_pipeline(
                 all_papers.extend(fetched)
 
     # ── OpenAlex (always, unless skip_conferences) ────────────────────────────
-    if not skip_conferences:
+    if not skip_conferences and not journals_only:
         if conferences_only:
             log.info(" [openalex] bulk-fetching ML/NLP/CV/IR papers …")
             try:
@@ -416,9 +427,11 @@ def run_pipeline(
 
     # ── Accumulate existing papers ────────────────────────────────────────────
     if accumulate:
-        if conferences_only:
-            # Conference sync: accumulate existing conference + journal papers (no expiry)
-            # and also bring in arXiv papers so the site output stays complete.
+        if conferences_only or journals_only:
+            # Conference / journal sync: accumulate existing conference + journal
+            # papers (no expiry) and also bring in arXiv papers so the site output
+            # stays complete. In journals-only mode the freshly fetched journals
+            # merge with these; conference/arXiv rows are preserved, not dropped.
             existing_conf = _load_conference_papers()
             existing_journals = _load_journal_papers()
             existing_arxiv = _load_arxiv_papers(max_age_days=max_age_days)
@@ -566,6 +579,11 @@ def _parse_args() -> argparse.Namespace:
         "--conferences-only", action="store_true",
         help="Fetch ONLY from conference sources (S2 + OpenReview). Skip arXiv and ACL.",
     )
+    parser.add_argument(
+        "--journals-only", action="store_true",
+        help="Fetch ONLY journal papers (OpenAlex by source id, +S2 supplement). "
+             "Existing conference/arXiv papers are preserved (Railway upsert).",
+    )
     parser.add_argument(
         "--backfill-from", metavar="YYYY-MM-DD",
         help="Fetch ALL arXiv CS papers from this date to today (e.g. 2026-01-01)",
@@ -592,6 +610,7 @@ def _parse_args() -> argparse.Namespace:
         today_max=args.today_max,
         skip_conferences=args.skip_conferences,
         conferences_only=args.conferences_only,
+        journals_only=args.journals_only,
         accumulate=not args.fresh_start,
         max_age_days=args.max_age_days,
         backfill_from=args.backfill_from,