Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions src/identifiers/migrations/0010_alter_identifier_id_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Generated by Django 4.2.20 on 2026-02-20 12:01

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the ``openalex`` choice to ``Identifier.id_type``."""

    dependencies = [
        ("identifiers", "0009_deduplicate_identifiers_20220527"),
    ]

    operations = [
        migrations.AlterField(
            model_name="identifier",
            name="id_type",
            field=models.CharField(
                max_length=300,
                choices=[
                    ("doi", "DOI"),
                    ("uri", "URI Path"),
                    ("pubid", "Publisher ID"),
                    ("openalex", "OpenAlex ID"),
                ],
            ),
        ),
    ]
1 change: 1 addition & 0 deletions src/identifiers/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
("doi", "DOI"),
("uri", "URI Path"),
("pubid", "Publisher ID"),
("openalex", "OpenAlex ID"),
)

IDENTIFIER_TYPES = {"uri", "pubid", "id", "doi"}
Expand Down
6 changes: 4 additions & 2 deletions src/metrics/management/commands/fetch_forward_links.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def process_article(link, article):
issn = link.issn.contents[0] if link.issn else None

defaults = {
"object_type": "article",
"source": "crossref",
"year": link.year.contents[0],
"journal_title": link.journal_title.contents[0],
"article_title": link.article_title.contents[0],
Expand All @@ -29,7 +31,6 @@ def process_article(link, article):
models.ArticleLink.objects.get_or_create(
article=article,
doi=doi,
object_type="article",
defaults=defaults,
)
print("[ok]")
Expand All @@ -45,6 +46,8 @@ def process_book(link, article):
title = link.volume_title.contents[0]

defaults = {
"object_type": "book",
"source": "crossref",
"year": link.year.contents[0],
"title": title,
"component_number": link.component_number.contents[0]
Expand All @@ -56,7 +59,6 @@ def process_book(link, article):
models.BookLink.objects.get_or_create(
article=article,
doi=doi,
object_type="book",
defaults=defaults,
)
print("[ok]")
Expand Down
180 changes: 180 additions & 0 deletions src/metrics/management/commands/fetch_openalex_citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import time

import requests
from django.core.management.base import BaseCommand

from identifiers import models as im
from journal import models as jm
from metrics import models as mm
from submission import models as sm

OPENALEX_API = "https://api.openalex.org/works"
FIELDS = "id,doi,display_name,publication_year,type,primary_location,biblio"


def _get_source(work):
primary = work.get("primary_location") or {}
return primary.get("source") or {}


def _store_article_link(article, work, object_type):
    """Create or update an ArticleLink for an OpenAlex *work* citing *article*.

    Returns True when a link was stored, False when the work carries no DOI
    (links without a DOI cannot be deduplicated, so they are skipped).
    """
    doi = (work.get("doi") or "").replace("https://doi.org/", "").strip()
    if not doi:
        return False

    biblio = work.get("biblio") or {}
    source = _get_source(work)

    mm.ArticleLink.objects.update_or_create(
        article=article,
        doi=doi,
        defaults={
            "object_type": object_type,
            "source": "openalex",
            "article_title": work.get("display_name") or "",
            "year": work.get("publication_year") or 0,
            "journal_title": source.get("display_name") or "",
            "journal_issn": source.get("issn_l") or "",
            "volume": biblio.get("volume") or "",
            "issue": biblio.get("issue") or "",
        },
    )
    return True


def _store_book_link(article, work):
    """Create or update a BookLink for an OpenAlex *work* citing *article*.

    Returns True when a link was stored, False when the work carries no DOI.
    OpenAlex does not expose ISBNs or component numbers via this endpoint,
    so those fields are stored empty.
    """
    doi = (work.get("doi") or "").replace("https://doi.org/", "").strip()
    if not doi:
        return False

    mm.BookLink.objects.update_or_create(
        article=article,
        doi=doi,
        defaults={
            "object_type": "book",
            "source": "openalex",
            "title": work.get("display_name") or "",
            "year": work.get("publication_year") or 0,
            "isbn_print": "",
            "isbn_electronic": "",
            "component_number": "",
        },
    )
    return True


class Command(BaseCommand):
    """
    Fetches citing works from OpenAlex for articles that have an OpenAlex
    identifier and stores them as ArticleLink or BookLink objects.
    """

    help = "Fetches citation data from OpenAlex and stores as ArticleLink/BookLink records."

    def add_arguments(self, parser):
        """Register CLI options for scoping and rate-limiting the fetch."""
        parser.add_argument(
            "--mailto",
            default="",
            help="Email address for OpenAlex polite pool.",
        )
        parser.add_argument(
            "--journal_code",
            default=None,
            help="Limit to a specific journal by code.",
        )
        parser.add_argument(
            "--article_id",
            type=int,
            default=None,
            help="Fetch citations for a single article by its primary key.",
        )
        parser.add_argument(
            "--delay",
            type=float,
            default=0.5,
            help="Seconds to wait between page requests (default 0.5).",
        )

    def handle(self, *args, **options):
        """Page through OpenAlex `cites:` results for each article and
        persist every citing work as an ArticleLink or BookLink."""
        mailto = options["mailto"]
        journal_code = options["journal_code"]
        article_id = options["article_id"]
        delay = options["delay"]

        # Only published articles that already carry an OpenAlex identifier
        # (populated by the fetch_openalex_ids command).
        articles = sm.Article.objects.filter(
            stage=sm.STAGE_PUBLISHED,
            identifier__id_type="openalex",
        ).distinct()

        if article_id:
            articles = articles.filter(pk=article_id)
        elif journal_code:
            journal = jm.Journal.objects.get(code=journal_code)
            articles = articles.filter(journal=journal)

        total = articles.count()
        self.stdout.write(f"Found {total} articles with OpenAlex IDs.")

        for i, article in enumerate(articles, start=1):
            openalex_id = article.identifier_set.filter(id_type="openalex").first()
            if not openalex_id:
                continue

            work_id = openalex_id.identifier
            self.stdout.write(f"[{i}/{total}] Article {article.pk} ({work_id})")

            cursor = "*"  # OpenAlex cursor pagination starts with "*"
            page_num = 0
            stored = 0
            skipped = 0

            while cursor:
                params = {
                    "filter": f"cites:{work_id}",
                    # OpenAlex uses hyphenated query-parameter names.
                    # "per_page" is ignored by the API, which then falls
                    # back to its default page size of 25 — making each
                    # article take ~8x more requests.
                    "per-page": 200,
                    "cursor": cursor,
                    "select": FIELDS,
                }
                if mailto:
                    params["mailto"] = mailto

                try:
                    response = requests.get(OPENALEX_API, params=params, timeout=30)
                    response.raise_for_status()
                    data = response.json()
                except requests.RequestException as e:
                    # Best-effort: abandon this article's remaining pages
                    # but keep processing the other articles.
                    self.stdout.write(f"  Request error on page {page_num}: {e}")
                    break

                results = data.get("results", [])
                meta = data.get("meta", {})
                page_num += 1
                self.stdout.write(f"  Page {page_num}: {len(results)} works")

                for work in results:
                    work_type = work.get("type") or ""
                    if work_type == "article":
                        ok = _store_article_link(article, work, "article")
                    elif work_type in ("book", "book-chapter"):
                        ok = _store_book_link(article, work)
                    else:
                        # Unknown types are still kept, tagged "other",
                        # so no citation is silently dropped.
                        ok = _store_article_link(article, work, "other")

                    if ok:
                        stored += 1
                    else:
                        skipped += 1

                time.sleep(delay)

                # None/absent next_cursor terminates the loop.
                cursor = meta.get("next_cursor")

            self.stdout.write(f"  Stored {stored} links, skipped {skipped} (no DOI).")

        self.stdout.write("Done.")
107 changes: 107 additions & 0 deletions src/metrics/management/commands/fetch_openalex_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import time
from urllib.parse import quote

import requests
from django.core.management.base import BaseCommand

from identifiers import models as im
from journal import models as jm
from submission import models as sm


class Command(BaseCommand):
    """
    Fetches OpenAlex Work IDs for published articles and stores them as
    Identifier objects with id_type="openalex".
    """

    help = "Fetches OpenAlex Work IDs for articles that have a DOI but no OpenAlex identifier."

    def add_arguments(self, parser):
        """Register CLI options for scoping and rate-limiting the fetch."""
        parser.add_argument(
            "--mailto",
            default="",
            help="Email address for OpenAlex polite pool.",
        )
        parser.add_argument(
            "--journal_code",
            default=None,
            help="Limit to a specific journal by code.",
        )
        parser.add_argument(
            "--article_id",
            type=int,
            default=None,
            help="Fetch a single article by its primary key.",
        )
        parser.add_argument(
            "--delay",
            type=float,
            default=0.5,
            help="Seconds to wait between requests (default 0.5).",
        )

    def handle(self, *args, **options):
        """Look up each article's DOI on OpenAlex and store the Work ID."""
        mailto = options["mailto"]
        journal_code = options["journal_code"]
        article_id = options["article_id"]
        delay = options["delay"]

        # Published articles that have a DOI but no OpenAlex ID yet.
        articles = (
            sm.Article.objects.filter(
                stage=sm.STAGE_PUBLISHED,
                identifier__id_type="doi",
            )
            .exclude(
                identifier__id_type="openalex",
            )
            .distinct()
        )

        if article_id:
            articles = articles.filter(pk=article_id)
        elif journal_code:
            journal = jm.Journal.objects.get(code=journal_code)
            articles = articles.filter(journal=journal)

        total = articles.count()
        self.stdout.write(f"Found {total} articles to process.")

        for i, article in enumerate(articles, start=1):
            doi_identifier = article.identifier_set.filter(id_type="doi").first()
            if not doi_identifier:
                continue

            doi = doi_identifier.identifier
            # Percent-encode the DOI (keeping "/") before placing it in the
            # URL path: DOIs may contain "#", "?", "<", ">" or spaces, which
            # would otherwise truncate or corrupt the request URL.
            url = (
                "https://api.openalex.org/works/https://doi.org/"
                f"{quote(doi, safe='/')}"
            )
            params = {"select": "id"}
            if mailto:
                params["mailto"] = mailto

            self.stdout.write(f"[{i}/{total}] {doi}", ending="... ")

            try:
                response = requests.get(url, params=params, timeout=30)
                if response.status_code == 404:
                    # DOI not indexed by OpenAlex — not an error.
                    self.stdout.write("not found, skipping.")
                    time.sleep(delay)
                    continue
                response.raise_for_status()
                data = response.json()
                openalex_url = data.get("id", "")
                # e.g. "https://openalex.org/W2741809807" -> "W2741809807"
                work_id = openalex_url.rstrip("/").split("/")[-1]
                if not work_id:
                    self.stdout.write("no ID in response, skipping.")
                    time.sleep(delay)
                    continue
                # get_or_create keeps the command idempotent across runs.
                im.Identifier.objects.get_or_create(
                    article=article,
                    id_type="openalex",
                    identifier=work_id,
                )
                self.stdout.write(f"stored {work_id}.")
            except requests.RequestException as e:
                # Log and continue with the next article.
                self.stdout.write(f"request error: {e}")

            time.sleep(delay)

        self.stdout.write("Done.")
Loading
Loading