PRIDE-Archive · ypriverol · Feb 18, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 16, 2026
diff --git a/paper/WileyNJD-AMA.bst b/paper/WileyNJD-AMA.bst
diff --git a/paper/figures/figure1_geographic_distribution.pdf b/paper/figures/figure1_geographic_distribution.pdf
diff --git a/paper/figures/figure1b_regional_distribution.pdf b/paper/figures/figure1b_regional_distribution.pdf
diff --git a/paper/figures/figure2_temporal_trends.pdf b/paper/figures/figure2_temporal_trends.pdf
diff --git a/paper/figures/figure4_protocol_usage.pdf b/paper/figures/figure4_protocol_usage.pdf
diff --git a/paper/figures/figure5_dataset_concentration.pdf b/paper/figures/figure5_dataset_concentration.pdf
diff --git a/paper/figures/figure6_top_datasets.pdf b/paper/figures/figure6_top_datasets.pdf
diff --git a/paper/figures/figure7_country_bubble.pdf b/paper/figures/figure7_country_bubble.pdf
diff --git a/paper/figures/figure_bot_detection_overview.pdf b/paper/figures/figure_bot_detection_overview.pdf
diff --git a/paper/figures/figure_filetype_by_region.pdf b/paper/figures/figure_filetype_by_region.pdf
diff --git a/paper/figures/figure_hub_distribution.pdf b/paper/figures/figure_hub_distribution.pdf
diff --git a/paper/figures/supp_figure_agreement.pdf b/paper/figures/supp_figure_agreement.pdf
diff --git a/paper/figures/supp_pride_overview.png b/paper/figures/supp_pride_overview.png
diff --git a/paper/main.pdf b/paper/main.pdf
diff --git a/paper/main.tex b/paper/main.tex
diff --git a/paper/proteomics.bst b/paper/proteomics.bst
diff --git a/paper/references.bib b/paper/references.bib
@@ -1,7 +1,18 @@
+@article{Deutsch2026,
+  author  = {Deutsch, Eric W. and Bandeira, Nuno and Perez-Riverol, Yasset and Sharma, Vagisha and Carver, Jeremy J. and Mendoza, Luis and Kundu, Deepti J. and Bandla, Chakradhar and Kamatchinathan, Selvakumar and Hewapathirana, Suresh and Sun, Zhi and Kawano, Shin and Okuda, Shujiro and Connolly, Brian and MacLean, Brendan and MacCoss, Michael J. and Chen, Tao and Zhu, Yunping and Ishihama, Yasushi and Vizca{\'\i}no, Juan Antonio},
+  title   = {The {ProteomeXchange} consortium in 2026: making proteomics data {FAIR}},
+  journal = {Nucleic Acids Res},
+  year    = {2026},
+  volume  = {54},
+  number  = {D1},
+  pages   = {D459--D469},
+  doi     = {10.1093/nar/gkaf1146}
+}
+
 @article{PerezRiverol2022reanalysis,
   title = {Proteomic repository data submission, dissemination, and reuse: key messages},
   author = {Perez-Riverol, Yasset},
-  journal = {Expert Review of Proteomics},
+  journal = {Expert Rev Proteomics},
   volume = {19},
   number = {7-12},
   pages = {297--310},
@@ -12,7 +23,7 @@ @article{PerezRiverol2022reanalysis
 @article{DiTommaso2017,
   title = {Nextflow enables reproducible computational workflows},
   author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Prieto Barja, Pablo and Palumbo, Emilio and Notredame, Cedric},
-  journal = {Nature Biotechnology},
+  journal = {Nat Biotechnol},
   volume = {35},
   number = {4},
   pages = {316--319},
@@ -23,7 +34,7 @@ @article{DiTommaso2017
 @article{Dai2021,
   title = {A proteomics sample metadata representation for multiomics integration and big data analysis},
   author = {Dai, Chengxin and F{\"u}llgrabe, Anja and Pfeuffer, Julianus and Solovyeva, Elizaveta M and Deng, Jingwen and Moreno, Pablo and Kamatchinathan, Selvakumar and Kundu, Deepti Jaiswal and George, Nancy and Fexova, Silvie and others},
-  journal = {Nature Communications},
+  journal = {Nat Commun},
   volume = {12},
   number = {1},
   pages = {5854},
@@ -34,7 +45,7 @@ @article{Dai2021
 @article{PerezRiverol2019,
   title = {Quantifying the impact of public omics data},
   author = {Perez-Riverol, Yasset and Zorin, Andrey and Dass, Gaurhari and Vu, Manh-Tu and Xu, Rui and Hermjakob, Henning and Vizcaíno, Juan Antonio},
-  journal = {Nature Communications},
+  journal = {Nat Commun},
   volume = {10},
   number = {1},
   pages = {3512},
@@ -46,7 +57,7 @@ @article{PerezRiverol2019
 @article{PerezRiverol2025,
   title = {The {PRIDE} database at 20 years: 2025 update},
   author = {Perez-Riverol, Yasset and Bandla, Chakradhar and Kundu, Deepti J and Kamatchinathan, Selvakumar and Bai, Jingwen and Hewapathirana, Suresh and John, Nithu Sara and Riera Duocastella, Marc and Vibranovski, Maria D and Hermjakob, Henning and Vizcaíno, Juan Antonio},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {53},
   number = {D1},
   pages = {D543--D553},
@@ -58,7 +69,7 @@ @article{PerezRiverol2025
 @article{Kamatchinathan2025,
   title = {pridepy: A {P}ython Package to Download and Search Data from {PRIDE} Database},
   author = {Kamatchinathan, Selvakumar and Hewapathirana, Suresh and Bandla, Chakradhar and Perez-Riverol, Yasset},
-  journal = {Journal of Open Source Software},
+  journal = {J Open Source Softw},
   volume = {10},
   number = {107},
   pages = {7563},
@@ -69,7 +80,7 @@ @article{Kamatchinathan2025
 @article{Dai2024,
   title = {quantms: a cloud-based pipeline for quantitative proteomics enables the reanalysis of public proteomics data},
   author = {Dai, Chengxin and Pfeuffer, Julianus and Wang, Hong and Zheng, Ping and Käll, Lukas and Sachsenberg, Timo and Demichev, Vadim and Bai, Mingze and Kohlbacher, Oliver and Perez-Riverol, Yasset},
-  journal = {Nature Methods},
+  journal = {Nat Methods},
   volume = {21},
   number = {9},
   pages = {1603--1607},
@@ -93,7 +104,7 @@ @article{Perez2019
 @article{Perez-Riverol2022,
   title = {The {PRIDE} database resources in 2022: a hub for mass spectrometry-based proteomics evidences},
   author = {Perez-Riverol, Yasset and Bai, Jingwen and Bandla, Chakradhar and Garc{\'\i}a-Seisdedos, David and Hewapathirana, Suresh and Kamatchinathan, Selvakumar and Kundu, Deepti J and Prakash, Ananth and Frericks-Zipper, Anika and Eisenacher, Martin and others},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {50},
   number = {D1},
   pages = {D483--D490},
@@ -105,7 +116,7 @@ @article{Perez-Riverol2022
 @article{Leinonen2011,
   title = {The {European Nucleotide Archive}},
   author = {Leinonen, Rasko and Akhtar, Ruth and Birney, Ewan and Bower, Lawrence and Cerdeno-T{\'a}rraga, Ana and Cheng, Yuan and Cleland, Iain and Faruque, Nadeem and Goodgame, Neil and Gibson, Richard and others},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {39},
   number = {suppl\_1},
   pages = {D28--D31},
@@ -117,7 +128,7 @@ @article{Leinonen2011
 @article{UniProtConsortium2023,
   title = {{UniProt}: the universal protein knowledgebase in 2023},
   author = {{The UniProt Consortium}},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {51},
   number = {D1},
   pages = {D483--D489},
@@ -130,6 +141,7 @@ @techreport{Imperva2023
   title = {Bad Bot Report 2023: The Account Takeover Edition},
   author = {{Imperva}},
   institution = {Imperva Inc.},
+  address = {San Mateo, CA},
   year = {2023},
   url = {https://www.imperva.com/resources/reports/2023-bad-bot-report/},
   note = {Annual analysis of automated bot traffic patterns across the internet}
@@ -138,7 +150,7 @@ @techreport{Imperva2023
 @article{Jonker2019,
   title = {Fingerprinting tooling used for {SSH} dictionary attack},
   author = {Jonker, Mattijs and Stone-Gross, Brett and Plonka, David and Boehme, Alistair},
-  journal = {Digital Investigation},
+  journal = {Digit Investig},
   volume = {31},
   pages = {S138--S146},
   year = {2019},
@@ -176,7 +188,7 @@ @inproceedings{Cabri2021
 @article{Habibi2020,
   title = {Bot detection using {U}ser {A}gent-based fingerprinting},
   author = {Habibi Lashkari, Arash and Kadir, Andi Fitriah Abdul and Gonzalez, Hugo and Mbah, Kenneth F and Ghorbani, Ali A},
-  journal = {Computers \& Security},
+  journal = {Comput Secur},
   volume = {95},
   pages = {101869},
   year = {2020},
@@ -197,7 +209,7 @@ @inproceedings{Liu2008
 @article{Breunig2000,
   title = {{LOF}: identifying density-based local outliers},
   author = {Breunig, Markus M and Kriegel, Hans-Peter and Ng, Raymond T and Sander, J{\"o}rg},
-  journal = {ACM SIGMOD Record},
+  journal = {ACM SIGMOD Rec},
   volume = {29},
   number = {2},
   pages = {93--104},
@@ -209,7 +221,7 @@ @article{Breunig2000
 @article{Scholkopf2001,
   title = {Estimating the support of a high-dimensional distribution},
   author = {Sch{\"o}lkopf, Bernhard and Platt, John C and Shawe-Taylor, John and Smola, Alex J and Williamson, Robert C},
-  journal = {Neural Computation},
+  journal = {Neural Comput},
   volume = {13},
   number = {7},
   pages = {1443--1471},
@@ -229,7 +241,7 @@ @article{Vaswani2017
 @article{Chandola2009,
   title = {Anomaly detection: A survey},
   author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
-  journal = {ACM Computing Surveys},
+  journal = {ACM Comput Surv},
   volume = {41},
   number = {3},
   pages = {1--58},
@@ -249,7 +261,7 @@ @inproceedings{Devlin2019
 @article{Pedregosa2011,
   title = {Scikit-learn: Machine learning in {P}ython},
   author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others},
-  journal = {Journal of Machine Learning Research},
+  journal = {J Mach Learn Res},
   volume = {12},
   pages = {2825--2830},
   year = {2011}
@@ -302,7 +314,7 @@ @inproceedings{Lundberg2017
 @article{Wilkinson2016,
   title = {The {FAIR} Guiding Principles for scientific data management and stewardship},
   author = {Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and others},
-  journal = {Scientific Data},
+  journal = {Sci Data},
   volume = {3},
   number = {1},
   pages = {160018},
@@ -326,7 +338,7 @@ @article{Orr2025
 @article{Demichev2020,
   author    = {Demichev, Vadim and Messner, Christoph B. and Vernardis, Spyros I. and Lilley, Kathryn S. and Ralser, Markus},
   title     = {{DIA-NN}: neural networks and interference correction enable deep proteome coverage in high throughput},
-  journal   = {Nature Methods},
+  journal   = {Nat Methods},
   year      = {2020},
   volume    = {17},
   number    = {1},
@@ -337,7 +349,7 @@ @article{Demichev2020
 @article{Cox2008,
   author    = {Cox, J{\"u}rgen and Mann, Matthias},
   title     = {{MaxQuant} enables high peptide identification rates, individualized p.p.b.-range mass accuracies and proteome-wide protein quantification},
-  journal   = {Nature Biotechnology},
+  journal   = {Nat Biotechnol},
   year      = {2008},
   volume    = {26},
   number    = {12},
@@ -348,7 +360,7 @@ @article{Cox2008
 @article{Kong2017,
   author    = {Kong, Andy T. and Leprevost, Felipe V. and Avtonomov, Dmitry M. and Mellacheruvu, Dattatreya and Nesvizhskii, Alexey I.},
   title     = {{MSFragger}: ultrafast and comprehensive peptide identification in mass spectrometry-based proteomics},
-  journal   = {Nature Methods},
+  journal   = {Nat Methods},
   year      = {2017},
   volume    = {14},
   pages     = {513--520},
@@ -368,7 +380,7 @@ @inproceedings{Raasveldt2019
 @article{Desiere2006,
   title = {The {PeptideAtlas} project},
   author = {Desiere, Frank and Deutsch, Eric W and King, Nichole L and Nesvizhskii, Alexey I and Mallick, Parag and Eng, Jimmy and Chen, Sharon and Eddes, James and Loevenich, Sandra N and Aebersold, Ruedi},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {34},
   number = {suppl\_1},
   pages = {D655--D658},
@@ -380,7 +392,7 @@ @article{Desiere2006
 @article{Craig2004,
   title = {Open source system for analyzing, validating, and storing protein identification data},
   author = {Craig, Robertson and Cortens, John P and Beavis, Ronald C},
-  journal = {Journal of Proteome Research},
+  journal = {J Proteome Res},
   volume = {3},
   number = {6},
   pages = {1234--1242},
@@ -392,7 +404,7 @@ @article{Craig2004
 @article{Decoster2022,
   title = {{Scop3P}: a comprehensive resource of human phosphosites within their full context},
   author = {Decoster, Pathmanaban and Nkuipou-Kenfack, Eliane and Van Den Bossche, Tim and Menschaert, Gerben and Martens, Lennart and Gevaert, Kris and Coornaert, Bert and Versele, Mathieu and Ndah, Elvis and Costanzo, Michael C and others},
-  journal = {Journal of Proteome Research},
+  journal = {J Proteome Res},
   volume = {22},
   number = {1},
   pages = {106--118},
@@ -404,7 +416,7 @@ @article{Decoster2022
 @article{Shao2020,
   title = {{MatrisomeDB} 2.0: 2023 updates to the {ECM}-protein knowledge database},
   author = {Shao, Xinhao and Gomez, Clarissa D and Kapoor, Nandini and Considine, James M and Grams, Christopher and Gao, Yu (Tom) and Naba, Alexandra},
-  journal = {Nucleic Acids Research},
+  journal = {Nucleic Acids Res},
   volume = {51},
   number = {D1},
   pages = {D1519--D1530},
@@ -421,3 +433,11 @@ @article{Dai2024pmultiqc
   doi = {10.1101/2025.11.02.685980},
   publisher = {Cold Spring Harbor Laboratory}
 }
+
+@misc{WellcomeLMIC,
+  author       = {{Wellcome Trust}},
+  title        = {Low- and middle-income countries},
+  howpublished = {\url{https://wellcome.org/research-funding/guidance/prepare-to-apply/low-and-middle-income-countries}},
+  year         = {2025},
+  note         = {Based on OECD DAC list. Accessed February 2026}
+}
diff --git a/paper/supplementary.pdf b/paper/supplementary.pdf
diff --git a/paper/supplementary.tex b/paper/supplementary.tex
@@ -31,12 +31,12 @@
 }
 
 \title{\textbf{Supplementary Notes}\\[0.5em]
-\large Tracking Dataset Reuse in Proteomics: A Comprehensive Analysis of PRIDE Archive Downloads}
+\large Tracking dataset reuse in proteomics: a comprehensive analysis of PRIDE data download statistics}
 
 \author{
 \small Suresh Hewapathirana, Jingwen Bai, Chakradhar Bandla, Selvakumar Kamatchinathan,\\
 \small Deepti J Kundu, Nithu Sara John, Boma Brown-Harry, Nandana Madhusoodanan,\\
-\small Marc Riera Duocastella, Juan Antonio Vizca\'{i}no, Yasset Perez-Riverol
+\small Joan Marc Riera Duocastella, Juan Antonio Vizca\'{i}no, Yasset Perez-Riverol
 }
 \date{}
 
@@ -64,12 +64,12 @@ \subsection{Log Processing Workflow}
 
 \subsection{Data Scale and Coverage}
 
-The processed dataset covers the period January 2020 through January 2025 (Figure~\ref{fig:pride_overview}). Key metrics include 47.35 million total file downloads across 32,106 distinct projects, 2.26 million unique files, and 807,156 unique users. The analyzed projects represent 96.4\% of all public PRIDE datasets, and 88.0\% of PRIDE files have been downloaded at least once. Downloads originate from 136 countries with more than 100 downloads each.
+The processed Parquet file covers the period January 2021 through December 2025 (Figure~\ref{fig:pride_overview}). Key metrics include 159.3 million total file downloads across 35,528 distinct projects, 2.98 million unique files, and 9.80 million unique users. The analyzed projects represent essentially all public PRIDE datasets, and 91.7\% of PRIDE files have been downloaded at least once. Downloads originate from 194 countries with more than 100 downloads each.
 
 \begin{figure}[H]
 \centering
 \includegraphics[width=\textwidth]{figures/supp_pride_overview.png}
-\caption{Overview of PRIDE download activity (2020--2025). Overall scale metrics, reuse intensity across projects/files/users, file coverage, and geographic reach.}
+\caption{Overview of PRIDE download activity (2021--2025). Overall scale metrics, reuse intensity across projects/files/users, file coverage, and geographic reach.}
 \label{fig:pride_overview}
 \end{figure}
 
@@ -375,7 +375,7 @@ \subsection{Inter-Method Agreement}
 
 \subsection{Classification Outcome Comparison}
 
-On the 1M-record benchmark sample, the two methods produce different classification distributions (Figure~\ref{fig:method_comparison}). The Rules method classifies 29\% of locations as bots (72\% of downloads), while Deep classifies 34\% (77\% of downloads).
+On the 1M-record benchmark sample, the two methods produce different classification distributions (Figure~\ref{fig:method_comparison}). The Rules method classifies 29\% of locations as bots (72\% of downloads), while Deep classifies 35\% (77\% of downloads).
 
 \begin{figure}[H]
 \centering
@@ -451,7 +451,7 @@ \subsection{Regional Distribution}
 \begin{figure}[H]
 \centering
 \includegraphics[width=0.65\textwidth]{figures/figure1b_regional_distribution.pdf}
-\caption{PRIDE downloads by world region (after bot removal, 2020--2025).}
+\caption{PRIDE downloads by world region (after bot removal, 2021--2025).}
 \label{fig:regional_supp}
 \end{figure}
 
@@ -492,7 +492,7 @@ \subsection{Top Downloaded Datasets}
 
 \subsection{Dataset Download Consistency}
 
-The consistency heatmap (Figure~7B in the main text) shows that top datasets maintain sustained download activity across multiple years rather than one-time spikes. Beyond this, we rank datasets by a consistency score combining low coefficient of variation with high activity ratio (Figure~\ref{fig:consistency_scores}). PXD013868 achieves the highest consistency score (0.788), indicating steady, reliable reuse across the study period.
+The consistency heatmap (Figure~5B in the main text) shows that top datasets maintain sustained download activity across multiple years rather than one-time spikes. Beyond this, we rank datasets by a consistency score combining low coefficient of variation with high activity ratio (Figure~\ref{fig:consistency_scores}). PXD013868 achieves the highest consistency score (0.788), indicating steady, reliable reuse across the study period.
 
 \begin{figure}[H]
 \centering
@@ -523,25 +523,14 @@ \subsection{Country-Level Usage Intensity}
 \label{fig:bubble_chart}
 \end{figure}
 
-\subsection{ProteomeXchange Resources}
-
-PRIDE hosts 83.2\% of all ProteomeXchange datasets, followed by MassIVE (6.9\%) and iProX (5.5\%) (Figure~\ref{fig:px_resources}). This dominance reflects PRIDE's position as the primary public repository for mass spectrometry proteomics data.
-
-\begin{figure}[H]
-\centering
-\includegraphics[width=0.5\textwidth]{figures/supp_px_resources.png}
-\caption{Distribution of datasets across ProteomeXchange partner resources.}
-\label{fig:px_resources}
-\end{figure}
-
 % ======================================================================
 \section{S9. Limitations}
 \label{sec:limitations}
 % ======================================================================
 
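-Our ground truth labels are heuristic-derived rather than manually verified, which may introduce systematic biases in the benchmark evaluation. The geographic attribution relies on IP geolocation, which can be inaccurate for users behind VPNs or institutional proxies. The 2025 data is from a partial year, making year-over-year comparisons with full years approximate. Finally, we cannot distinguish multiple individual users who share a geographic location from a single user, which may affect location-level statistics.
+Our ground truth labels are heuristic-derived rather than manually verified, which may introduce systematic biases in the benchmark evaluation. The geographic attribution relies on IP geolocation, which can be inaccurate for users behind VPNs or institutional proxies. Where the logs cover only part of a calendar year, year-over-year comparisons with full years are approximate. Finally, we cannot distinguish multiple individual users who share a geographic location from a single user, which may affect location-level statistics.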
 
-\bibliographystyle{unsrtnat}
+\bibliographystyle{proteomics}
 \bibliography{references}
 
 \end{document}