diff --git a/paper/WileyNJD-AMA.bst b/paper/WileyNJD-AMA.bst new file mode 100644 index 0000000..d0f71e7 --- /dev/null +++ b/paper/WileyNJD-AMA.bst @@ -0,0 +1,1126 @@ +% +% THIS IS AN ALPHA VERSION!!! +% +% USE AT YOUR OWN RISK +% +% This should work for articles...I don't know if it works well for +% books, etc (give it a try and see!). +% +% PLEASE COMPARE RESULTS WITH THE INSTRUCTIONS FOR AUTHORS FOR THE +% JOURNAL YOU ARE SUBMITTING FOR...I CANNOT ACCEPT RESPONSIBILITY +% FOR REJECTED MANUSCRIPTS (but I will try to fix it if you point out +% a bug) +% +% AMA Manual of Style (JAMA, Cancer, many others..) +% -Up to 6 authors, otherwise 3 authors et al. +% -Title in italics +% -numeric labels +% -order-of-reference. +% +% Author L, Author S, Author D, et al. Title. Journal. +% YYYY;VOL(NUM):PPP-PPP. +% +% +% History +% 9/30/85 (HWT) IEETR Original version, by Howard Trickey. +% 1/29/88 (OP&HWT) Updated for BibTeX version 0.99a, Oren Patashnik; +% 3/27/02 IEETR style used as framework. Formats heavily changed by +% Eric Kort (eric.kort@vai.org) +% 1/02/18 Update to fit the 10th edition of AMA citation style +% Thomas DESCHLER (thomas@deschler.fr) +% +% THIS VERSION DOES NOT WORK WITH BIBTEX 0.98i. 
+% + +ENTRY + { address + author + booktitle + chapter + doi + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.quote after.sentence + after.quoted.block after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.quote := + #3 'after.sentence := + #4 'after.quoted.block := + #5 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { "" * write$ } + { output.state after.quote = + { " " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { output.state after.quoted.block = + { write$ + newline$ + "\newblock " write$ + } + { add.period$ " " * write$ } + if$ + } + if$ + } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {blank.sep} +{ after.quote 'output.state := +} + +FUNCTION {fin.entry} +{ + doi empty$ + { + output.state after.quoted.block = + 'skip$ + 'add.period$ + if$ + } + { + } + if$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { output.state after.quote = + { after.quoted.block 'output.state := } + { after.block 'output.state := } + if$ + } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + 
+FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\it " swap$ * "}" * } + if$ +} + +INTEGERS {nameptr namesleft numnames etal} + +FUNCTION {format.names} +{ 's := % push the name s, pop s and author (already on stack), assign author to s + #1 'nameptr := + #0 'etal := + + s num.names$ 'numnames := + + numnames #6 > + {#3 'numnames := + #1 'etal := + } + {} + %end if + if$ + + numnames 'namesleft := + { namesleft #0 > } + + {s nameptr "{ll }{v}{f{}}" format.name$ 't := + namesleft #1 > + {t * ", " * } + {t} + %end if + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + + etal + {", et al. "} + {". "} + if$ + * +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { "\kern-2pt, eds." * } + { "\kern-2pt, ed." * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "." *} + if$ +} + +FUNCTION {format.journal} +{ journal empty$ + { "" } + { journal "" *} + if$ +} + +FUNCTION {format.title.p} +{ title empty$ + { "" } + { title ". 
" *} + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "-" = not + { "-" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { "" } + { " " * year } + if$ +} + +FUNCTION {inproformat.date} +{ year empty$ + { "" } + { "; " * year } + if$ +} + +FUNCTION {incollecformat.pages} +{ pages empty$ + { "" } + {" (pp. " pages * ")" *} + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { ". " volume * + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "no.~" } + { "No.~" } + if$ + number * + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { edition "l" change.case$ "~ed." 
* } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + {"\string: " pages *} + if$ +} + +FUNCTION {format.volume} +{ volume empty$ + { "" } + { "\string; " volume * } + if$ +} + +FUNCTION {format.number} +{ number empty$ + { "" } + { "(" number * ")" *} + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "ch.~" chapter * } + { type "l" change.case$ chapter tie.or.space.connect } + if$ + pages empty$ + 'skip$ + { "" * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "In: " } + { "In: " + editor empty$ + 'skip$ + { " " * format.editors * " " * booktitle emphasize * "" * } + if$ + } + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + output.state after.block = + { type "t" change.case$ } + { type "l" change.case$ } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Tech. Rep." 
} + 'type + if$ + number empty$ + { "l" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.addr.pub} +{ publisher empty$ + { "" } + { address empty$ + { "" } + { address ": " * } + if$ + publisher * + } + if$ +} + +FUNCTION {format.paddress} +{ address empty$ + { "" } + { "(" address * ")" * } + if$ +} + +FUNCTION {format.ppaddress} +{ address empty$ + { "" } + { "; " address * "" * } + if$ +} + + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "in {\em " journal * "\/}" * } + if$ + } + { "in " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " {\em et~al.}" * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " {\em et~al.}" * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Vol.~" volume * + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "in {\em " booktitle * "\/}" * } + if$ + } + { "in " key * } + if$ + } + { "in " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {default.name.doi.prefix} +{ + "doi: " +} + +FUNCTION 
{no.blank.or.punct} +{ "" * before.all 'output.state := +} + +FUNCTION {add.semicolon} +{ + ";" * + no.blank.or.punct +} + +FUNCTION {add.comma} +{ + "," * + no.blank.or.punct +} + +FUNCTION {add.colon} +{ + ": " * + no.blank.or.punct +} + +FUNCTION {add.space} +{ + " " * + no.blank.or.punct +} + +FUNCTION {doi.base} +{ + "http://dx.doi.org/" +} + +FUNCTION {init.bib.doi} +{ + "\providecommand \doibase [0]{" doi.base "}%" * * write$ newline$ +} + +FUNCTION {doi.base.command} +{ + "\doibase " +} + +FUNCTION {noop.command} +{ + "\href@noop " +} + +FUNCTION {href.command} +{ + "\href " +} + +FUNCTION {link.tag.open} +{ + doi duplicate$ empty$ + { + pop$ + url duplicate$ empty$ + { + pop$ "" noop.command + }{ + href.command + } + if$ + } + { + doi.base.command swap$ * + href.command + } + if$ + "{" * swap$ * "} {" * +} + +FUNCTION {link.tag.shut} +{ + "}" +} + +FUNCTION {add.doi} +{ + link.tag.open swap$ * link.tag.shut * +} + +FUNCTION {format.doi} +{ doi empty$ + { "" } + { default.name.doi.prefix doi * + link.tag.open swap$ * link.tag.shut * + } + if$ +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + format.title "title" output.check + blank.sep + crossref missing$ + { format.journal emphasize "journal" output.check + format.date "year" output.check + format.volume output + format.number output + format.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + format.doi output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + format.addr.pub "publisher" output.check + } + { new.block + format.book.crossref output.nonnull + } + if$ + new.block + 
format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + title empty$ + { "empty title in " cite$ * warning$ + howpublished new.sentence.checka + } + { howpublished empty$ not + address empty$ month empty$ year empty$ and and + or + { format.title.p output.nonnull } + { format.title output.nonnull } + if$ + blank.sep + } + if$ + howpublished add.colon add.space output + address add.period$ add.space output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.btitle "title" output.check + crossref missing$ + { format.bvolume add.period$ add.space output + new.block + format.chapter.pages add.semicolon add.space "chapter and pages" output.check + new.block + format.number.series add.period$ add.space output + format.addr.pub "publisher" output.check + new.block + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + format.title "title" output.check + blank.sep + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + new.block + format.number.series add.period$ add.space output + format.bvolume add.period$ add.space output + format.addr.pub add.period$ add.space "publisher" output.check + format.edition output + format.date "year" output.check + %%format.chapter.pages output + incollecformat.pages output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION 
{inproceedings} +{ output.bibitem + format.authors "author" output.check + format.title "title" output.check + blank.sep + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume add.period$ add.space output + format.number.series add.period$ add.space output + organization add.period$ add.space output + publisher output + inproformat.date "year" output.check + format.ppaddress output + format.pages output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + format.btitle add.period$ add.space "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization add.semicolon add.space output + address add.colon output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + format.title add.space "title" output.check + blank.sep + "Master's thesis" format.thesis.type add.period$ add.space output.nonnull + school add.period$ add.space "school" output.check + address add.colon add.space output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + title empty$ + { howpublished new.sentence.checka } + { howpublished empty$ not + month empty$ year empty$ and + or + { format.title.p output.nonnull } + { format.title output.nonnull } + if$ + blank.sep + } + if$ + howpublished add.semicolon add.space output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ 
output.bibitem + format.authors "author" output.check + format.btitle add.period$ add.space "title" output.check + new.block + "PhD thesis" format.thesis.type add.period$ add.space output.nonnull + school add.comma add.space "school" output.check + address add.semicolon add.space output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors add.comma add.space output.nonnull } + if$ + format.btitle "title" output.check + format.bvolume output + format.number.series add.space output + format.paddress add.semicolon add.space output + editor empty$ + 'skip$ + { organization add.comma add.space output } + if$ + publisher add.colon add.space output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + format.title add.space "title" output.check + blank.sep + format.tr.number add.comma add.space output.nonnull + institution add.semicolon add.space "institution" output.check + address add.colon add.space output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + format.title.p "title" output.check + blank.sep + note add.semicolon add.space "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} {"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications ACM"} + +MACRO {ibmjrd} {"IBM J. 
Research and Development"} + +MACRO {ibmsj} {"IBM Systems~J."} + +MACRO {ieeese} {"IEEE Trans. Software Engineering"} + +MACRO {ieeetc} {"IEEE Trans. Computers"} + +MACRO {ieeetcad} + {"IEEE Trans. Computer-Aided Design"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM J. Computing"} + +MACRO {tocs} {"ACM Trans. Computer Systems"} + +MACRO {tods} {"ACM Trans. Database Systems"} + +MACRO {tog} {"ACM Trans. Graphics"} + +MACRO {toms} {"ACM Trans. Mathematical Software"} + +MACRO {toois} {"ACM Trans. Office Information Systems"} + +MACRO {toplas} {"ACM Trans. Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} + +READ + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * "}" * write$ newline$ + init.bib.doi +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} \ No newline at end of file diff --git a/paper/figures/figure1_geographic_distribution.pdf b/paper/figures/figure1_geographic_distribution.pdf index 480eadf..9782e53 100644 Binary files a/paper/figures/figure1_geographic_distribution.pdf and b/paper/figures/figure1_geographic_distribution.pdf differ diff --git 
a/paper/figures/figure1b_regional_distribution.pdf b/paper/figures/figure1b_regional_distribution.pdf index caca5c6..615db04 100644 Binary files a/paper/figures/figure1b_regional_distribution.pdf and b/paper/figures/figure1b_regional_distribution.pdf differ diff --git a/paper/figures/figure2_temporal_trends.pdf b/paper/figures/figure2_temporal_trends.pdf index 507fca8..a4f9dc8 100644 Binary files a/paper/figures/figure2_temporal_trends.pdf and b/paper/figures/figure2_temporal_trends.pdf differ diff --git a/paper/figures/figure4_protocol_usage.pdf b/paper/figures/figure4_protocol_usage.pdf index 6b7d3fe..bcdd9a3 100644 Binary files a/paper/figures/figure4_protocol_usage.pdf and b/paper/figures/figure4_protocol_usage.pdf differ diff --git a/paper/figures/figure5_dataset_concentration.pdf b/paper/figures/figure5_dataset_concentration.pdf index 6efe73a..2ef28f5 100644 Binary files a/paper/figures/figure5_dataset_concentration.pdf and b/paper/figures/figure5_dataset_concentration.pdf differ diff --git a/paper/figures/figure6_top_datasets.pdf b/paper/figures/figure6_top_datasets.pdf index bc952fb..1395745 100644 Binary files a/paper/figures/figure6_top_datasets.pdf and b/paper/figures/figure6_top_datasets.pdf differ diff --git a/paper/figures/figure7_country_bubble.pdf b/paper/figures/figure7_country_bubble.pdf index 5ea9505..ee1f17d 100644 Binary files a/paper/figures/figure7_country_bubble.pdf and b/paper/figures/figure7_country_bubble.pdf differ diff --git a/paper/figures/figure_bot_detection_overview.pdf b/paper/figures/figure_bot_detection_overview.pdf index 53f87b6..32f3b70 100644 Binary files a/paper/figures/figure_bot_detection_overview.pdf and b/paper/figures/figure_bot_detection_overview.pdf differ diff --git a/paper/figures/figure_filetype_by_region.pdf b/paper/figures/figure_filetype_by_region.pdf index a2c7368..bac6713 100644 Binary files a/paper/figures/figure_filetype_by_region.pdf and b/paper/figures/figure_filetype_by_region.pdf differ diff --git 
a/paper/figures/figure_hub_distribution.pdf b/paper/figures/figure_hub_distribution.pdf index 030dc4a..b8fbd68 100644 Binary files a/paper/figures/figure_hub_distribution.pdf and b/paper/figures/figure_hub_distribution.pdf differ diff --git a/paper/figures/supp_figure_agreement.pdf b/paper/figures/supp_figure_agreement.pdf index 4aa4845..0a47ac9 100644 Binary files a/paper/figures/supp_figure_agreement.pdf and b/paper/figures/supp_figure_agreement.pdf differ diff --git a/paper/figures/supp_pride_overview.png b/paper/figures/supp_pride_overview.png index 43b5119..a851595 100644 Binary files a/paper/figures/supp_pride_overview.png and b/paper/figures/supp_pride_overview.png differ diff --git a/paper/main.pdf b/paper/main.pdf index 29ebab4..79ed30d 100644 Binary files a/paper/main.pdf and b/paper/main.pdf differ diff --git a/paper/main.tex b/paper/main.tex index 0389fc4..12e8ed6 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -25,13 +25,13 @@ \definecolor{automatedorange}{RGB}{230,126,34} % Title -\title{\textbf{Tracking Dataset Reuse in Proteomics: A Comprehensive Analysis of PRIDE Archive Downloads}} +\title{\textbf{Tracking dataset reuse in proteomics: a comprehensive analysis of PRIDE data download statistics}} \author{ Suresh Hewapathirana$^{1}$, Jingwen Bai$^{1}$, Chakradhar Bandla$^{1}$,\\ Selvakumar Kamatchinathan$^{1}$, Deepti J Kundu$^{1}$, Nithu Sara John$^{1}$,\\ Boma Brown-Harry$^{1}$, Nandana Madhusoodanan$^{1}$,\\ - Marc Riera Duocastella$^{1}$, Juan Antonio Vizca\'{i}no$^{1}$, Yasset Perez-Riverol$^{1,*}$\\[0.5em] + Joan Marc Riera Duocastella$^{1}$, Juan Antonio Vizca\'{i}no$^{1}$, Yasset Perez-Riverol$^{1,*}$\\[0.5em] \small $^{1}$European Molecular Biology Laboratory, European Bioinformatics Institute (EMBL-EBI),\\ \small Wellcome Trust Genome Campus, Hinxton, Cambridge CB10 1SD, UK\\ \small $^{*}$Corresponding author: yperez@ebi.ac.uk @@ -45,7 +45,7 @@ % Abstract \begin{abstract} -Understanding how scientific datasets are accessed and reused is 
essential for resource planning, impact assessment, and the evaluation of open science policies. Here we present the PRIDE Archive download tracking infrastructure and a comprehensive analysis of download records from the PRIDE proteomics database (2020-2025), spanning 34,085 datasets accessed from 213 countries. The infrastructure includes \texttt{nf-downloadstats}, a scalable Nextflow pipeline for processing large-scale download logs, and DeepLogBot, a deep-learning bot detection framework. After removing 88.0\% of download traffic identified as automated, analysis of the remaining 19.1 million genuine downloads reveals that the United States leads PRIDE data reuse (26.8\%), followed by the United Kingdom (23.6\%) and Germany (22.5\%). Downloads grew steadily from 1.2 million in 2021 to 5.8 million in 2025, with FTP and HTTP alternating as the dominant download protocol. Dataset reuse follows a highly concentrated distribution, with the top 1\% of datasets accounting for 43.3\% of all downloads. These findings provide actionable insights for repository infrastructure planning and demonstrate the importance of bot-aware analytics for scientific data resources. +Understanding how scientific datasets are accessed and reused is essential for resource planning, impact assessment, and the evaluation of open science policies. Here we present the PRIDE Archive download tracking infrastructure and a comprehensive analysis of data download statistics records from the PRIDE proteomics database during the last 5 years (2021-2025), spanning 34,085 datasets (96.4\% of all public PRIDE datasets) accessed from 213 countries. The infrastructure includes \texttt{nf-downloadstats}, a scalable open Nextflow pipeline for processing large-scale download logs, and DeepLogBot, an open deep-learning bot detection framework. 
After removing 88.0\% of download traffic identified as automated, analysis of the remaining 19.1 million genuine individual file downloads reveals that the United States leads PRIDE data downloads (26.8\% of all downloaded files), followed by the United Kingdom (23.6\%) and Germany (22.5\%). Downloads grew steadily from 1.2 million in 2021 to 5.3 million in 2025, with FTP and HTTP alternating as the dominant download protocol. Dataset reuse follows a highly concentrated distribution, with the top 1\% of datasets accounting for 43.3\% of all downloads. These findings provide actionable insights for repository infrastructure planning and demonstrate the importance of bot-aware analytics for scientific data resources. \begin{sloppypar} \noindent\textbf{Availability:} The \texttt{nf-downloadstats} pipeline is available at \url{https://github.com/PRIDE-Archive/nf-downloadstats} and DeepLogBot at \url{https://github.com/ypriverol/deeplogbot}, both under the Apache 2.0 license. @@ -58,39 +58,37 @@ \section{Introduction} % ====================================================================== -The Proteomics Identification database (PRIDE) is a leading public mass spectrometry-based proteomics data repository \citep{PerezRiverol2025}. As a founding member of the ProteomeXchange consortium, PRIDE enables researchers to share and access high-quality proteomics datasets globally, promoting transparency, reproducibility, and data reuse. Aligned with the FAIR principles - Findable, Accessible, Interoperable, and Reusable \citep{Wilkinson2016} - PRIDE supports open science by ensuring that public datasets are well-annotated and machine-readable. These principles are essential for maximizing the value of shared scientific data. +The PRIDE database is the world-leading data repository for mass spectrometry-based proteomics \citep{PerezRiverol2025}. 
As a founding member of the ProteomeXchange consortium \citep{Deutsch2026}, PRIDE enables researchers to share and access proteomics datasets globally, promoting transparency, reproducibility, and data reuse. Aligned with the FAIR principles - Findable, Accessible, Interoperable, and Reusable \citep{Wilkinson2016} - PRIDE supports open science by ensuring that public datasets are well-annotated and machine-readable. These principles are essential for maximizing the value of shared scientific data. -Understanding how public datasets are reused is essential for assessing their scientific impact and informing data-driven policies. While citations in scholarly publications offer one indicator of reuse, download statistics provide a more immediate and granular view of data demand and utility. In our previous work \citep{PerezRiverol2019}, we demonstrated that usage metrics can serve as complementary indicators of impact, supporting improved data stewardship, resource allocation, and funding decisions. Beyond measuring impact, download statistics are critical for designing more effective data infrastructures. Insights into download behavior can inform the optimization of data access protocols, guide the prioritization of metadata curation and visualization features, and identify high-value datasets for targeted annotation or integration efforts. As public data volumes continue to grow, usage-driven strategies become increasingly important for improving dataset discoverability and reuse. For example, frequently downloaded datasets - particularly those used as community benchmarks - could be prioritized for enhanced metadata annotation (e.g., SDRF sample descriptions \citep{Dai2021}) and enriched with curated tags and keywords, making them easier to find through search interfaces. Users could then combine these annotations with download counts to identify the most relevant and community-validated datasets for their analyses. 
Similarly, repositories can leverage download patterns to allocate faster transfer services and optimized storage for high-demand datasets, ensuring that the most reused data remain readily accessible. +Understanding how much public datasets are reused is essential for assessing the scientific impact of a data resource such as PRIDE, and informing data-driven policies. While citations in scholarly publications offer one indicator of reuse, data download statistics can provide a more granular view of data demand from any data resource. In our previous work \citep{PerezRiverol2019}, we demonstrated that usage/downloads metrics can serve as complementary indicators of scientific impact for a given publication, supporting improved data stewardship, resource allocation, and funding decisions. Beyond measuring the global impact of a data resource, download statistics are critical for designing more effective data infrastructures. Download patterns can inform the optimization of data access protocols, guide the prioritization of metadata curation and visualization features, and identify high-value datasets for targeted annotation or integration efforts. As public data volumes continue to grow, usage-driven strategies become increasingly important for improving dataset discoverability and reuse. For example, in the concrete case of PRIDE, frequently downloaded datasets - particularly those used as community benchmarks - could be prioritized for enhanced manual metadata curation/annotation (e.g., SDRF-based sample descriptions \citep{Dai2021}) and enriched with curated tags and keywords, making them easier to find through search interfaces. Users could then combine these annotations with download counts as a proxy to identify the most relevant and community-validated datasets for reuse. 
Similarly, repositories can leverage download patterns to allocate faster transfer services and optimized storage for high-demand datasets, ensuring that the most reused data remain readily accessible. In contrast, knowing which datasets are less frequently downloaded can help repositories to identify underused datasets that may benefit from improved metadata and file organization. -Despite their importance, systematic tracking of dataset downloads remains a major challenge across bioinformatics resources, PRIDE and other ProteomeXchange partners. Barriers include the absence of a standardized infrastructure for logging access events, technical complexities in aggregating usage data across distributed and heterogeneous transfer systems (e.g., FTP, HTTP), and ongoing concerns related to user privacy and data protection. Compounding these challenges, automated bot traffic contaminates download statistics - studies estimate that bots account for 30-70\% of all internet traffic (https://cpl.thalesgroup.com/ppc/application-security/bad-bot-report), and scientific repositories are particularly attractive targets due to their open-access policies and valuable content \citep{Orr2025}. Without accounting for this contamination, any analysis of repository usage risks drawing conclusions from inflated and distorted metrics. As bioinformatics resources continue to scale in both size and complexity, robust download analytics will become increasingly vital - not only for measuring impact - but also for enabling smarter, user-informed development of open data platforms. +Despite their importance, systematic tracking of dataset downloads remains a major challenge across bioinformatics resources in general, including PRIDE and the other ProteomeXchange partners. 
Although some common access statistics have been widely adopted across data resources \citep{Perez2019}, barriers include
Using these tools on 159.3 million download records from 2020 to 2025, we characterize global proteomics data reuse patterns, examining geographic distribution across countries and regions, temporal evolution of download activity, shifts in download protocol preferences, and the concentration of dataset reuse across the PRIDE collection. The workflow and resulting data are open-source and can be used by research labs and data depositors in grant reports and publications. +Here, we present the PRIDE Archive download tracking open infrastructure, which includes \texttt{nf-downloadstats}, a large-scale Nextflow \citep{DiTommaso2017} workflow for processing extensive traffic logs, and DeepLogBot, a bot detection framework that implements two complementary algorithms to identify and remove automated traffic. Additionally, we have developed an infrastructure that integrates download statistics directly into the PRIDE Archive web interface, allowing users to sort datasets (e.g. as the result of searches) by total downloads - both raw and normalized by number of files in the dataset - and perform percentile-based analyses. Using these tools on 159.3 million download records spanning 2021 to 2025, we characterize global proteomics data download patterns as a proxy for data reuse, examining geographic distribution across countries and regions, temporal evolution of download activity, shifts in download protocol preferences, and the concentration of dataset downloads across the PRIDE collection. The workflow is open-source and we make available the resulting data, which can be used by PRIDE data submitters in e.g. grant reports and publications. \end{sloppypar} % ====================================================================== \section{Methods} % ====================================================================== -\subsection{PRIDE Download Logs} - -Download logs are stored in a secure file system as compressed, comma-delimited text files. 
Each log entry includes a timestamp, dataset accession, filename, anonymized and non-reversible IP hash, download status, user country, download protocol (Globus, HTTP, Aspera, FTP), and dataset type. No personal or directly identifiable user data is stored. Log files are organized hierarchically by Protocol, Public/Private access, Year, Month, and Day. Individual log files can be large, ranging from 1~GB to 237~GB (Supplementary Notes Section~S1). - \subsection{nf-downloadstats: Log Processing Pipeline} +Download logs are stored in the EBI (European Bioinformatics Institute) file system as compressed, comma-delimited text files. Each log entry includes a timestamp, dataset accession, filename, anonymized and non-reversible IP hash, download status, geographic location (geo), used download protocol (Globus, HTTP, Aspera, FTP), and dataset type. No personal or directly identifiable user data is stored. Log files are organized hierarchically by Protocol, Public/Private access, Year, Month, and Day. Individual log files can be large, ranging from 1~GB to 237~GB (Supplementary Notes Section~S1). + \begin{sloppypar} Due to the large volume and size of download log files, we developed \texttt{nf-downloadstats} (\url{https://github.com/PRIDE-Archive/nf-downloadstats}), an open-source Nextflow workflow for large-scale processing of the original anonymized log files (Figure~\ref{fig:bot_overview}A). Each processing step is implemented as an independent Python module, performing tasks such as removing incomplete transfers and filtering for PRIDE-specific records. A custom log file parser addresses the heterogeneity and scale of download log data; due to inconsistencies in log entries such as variable column structures and incomplete records, additional filters retain only complete transfers associated with PRIDE datasets. 
\end{sloppypar} -To efficiently process the large volume of log files, parallelization is employed using a high-performance computing (HPC) environment managed via Slurm. Log files are processed in batches, with batch sizes and filtering criteria defined in a user-friendly YAML configuration file. The output is a consolidated 4.7~GB Parquet file containing 159,327,635 individual download records spanning January 2020 through January 2025. Each record includes the download date, geographic location (derived from IP geolocation), country, dataset accession, filename, download method (protocol), and an anonymized user identifier. The data covers 35,528 unique dataset accessions accessed from 235 countries. +To efficiently process the large volume of log files, parallelization is employed using a high-performance computing (HPC) environment managed via Slurm. Log files are processed in batches, with batch sizes and filtering criteria defined in a user-friendly YAML configuration file. The output is a consolidated 4.7~GB Parquet file containing 159,327,635 individual download records spanning five years: from January 2021 through December 2025. Each record includes the download date, geolocation, dataset accession, filename, download method (protocol). The data covers 35,528 unique dataset accessions accessed from 235 countries. -For analysis, individual download events are aggregated at the \textit{location} level, where each location represents a unique geographic coordinate. This aggregation produces 71,133 location profiles, each characterized by behavioral features including download volume, user counts, temporal patterns, and access characteristics. +For analysis, individual download events are aggregated at the \textit{location} level, where each location represents a unique geographic coordinate. 
This aggregation produces 71,133 location profiles, each characterized by behavioral features including download volumes, user counts, temporal patterns, and access characteristics. \subsection{Bot Detection Framework} We implemented two complementary bot detection algorithms, each combining Isolation Forest anomaly detection \citep{Liu2008} with a distinct classification strategy. Both methods share a common feature extraction pipeline that computes 90+ behavioral features per location, organized into four categories: activity features (download counts, user statistics), temporal features (hourly/yearly entropy, working hours ratio), behavioral features (burst patterns, circadian deviation, coordination scores), and discriminative features (file exploration patterns, user authenticity scores). Full feature descriptions are provided in Supplementary Notes Section~S3. -The \textit{rule-based} method applies YAML-configurable threshold patterns to classify each location into one of three categories: \textbf{bot} (automated scraping or crawling), \textbf{hub} (legitimate automation such as institutional mirrors or CI/CD pipelines), or \textbf{organic} (human researchers). Patterns are evaluated sequentially, with the first match determining classification. This approach prioritizes interpretability and configurability. The \textit{deep} method augments rule-based classification with additional behavioral feature engineering, including bot interaction features (download concentration, temporal irregularity, composite bot score) and bot signature features (request velocity, access regularity, session anomaly patterns). These 40+ additional features enable more nuanced separation of the three categories. The method incorporates a two-stage classification pipeline: Stage~1 separates organic from automated traffic, and Stage~2 distinguishes malicious bots from legitimate automation (hubs) using a discriminative scoring system with behavioral validation. 
+The \textit{rule-based} method applies YAML-configurable threshold patterns to classify each location into one of three categories: \textbf{bot} (automated scraping or crawling), \textbf{hub} (legitimate automation such as institutional mirrors or continuous integration/continuous deployment (CI/CD) pipelines), or \textbf{organic} (human researchers). Patterns are evaluated sequentially, with the first match determining classification. This approach prioritizes interpretability and configurability. The \textit{deep} method augments rule-based classification with additional behavioral feature engineering, including bot interaction features (download concentration, temporal irregularity, composite bot score) and bot signature features (request velocity, access regularity, session anomaly patterns). These 40+ additional features enable more nuanced separation of the three categories. The method incorporates a two-stage classification pipeline: Stage~1 separates organic from automated traffic, and Stage~2 distinguishes malicious bots from legitimate automation (hubs) using a discriminative scoring system with behavioral validation. To evaluate both algorithms, we constructed a ground truth dataset of 1,411 labeled locations (88 bots, 44 hubs, 1,279 organic) using high-confidence heuristic criteria (Supplementary Notes Section~S5). Both methods were evaluated on a 1-million record sample; we computed precision, recall, and F1 score per category, with 1,000-iteration bootstrap confidence intervals for the macro F1. The Deep method achieves the highest macro F1 score (0.775, 95\% CI: 0.731-0.818), with perfect bot recall (1.000) and strong hub detection (F1 = 0.718), while the Rules method provides higher bot precision (0.506) but detects fewer hubs (F1 = 0.275, Supplementary Notes Section~S6.) 
@@ -103,35 +101,33 @@ \subsection{Bot Detection Framework} \label{fig:bot_overview} \end{figure} -\subsection{PRIDE Download Statistics Visualization} - -The aggregated download statistics produced by \texttt{nf-downloadstats} are stored in MongoDB and Elasticsearch to enable fast searching and visualization within the PRIDE Archive web interface. For each dataset, the total number of downloads is displayed alongside a gradient bar indicating its download percentile, with higher intensity representing datasets in the top 1\% of downloads (Figure~\ref{fig:pride_viz}). Additionally, a yearly trend chart is provided for each dataset, illustrating download activity over time. Datasets can be sorted by both total downloads per project and a normalized metric that accounts for the number of files within each project, enabling users to identify highly reused datasets for benchmarking or reanalysis. - -\begin{figure}[H] -\centering -\includegraphics[width=0.75\textwidth]{figures/figure_pride_visualization.png} -\caption{PRIDE Archive download statistics integration. Each dataset displays a gradient percentile bar and total download count. A popup trend chart shows yearly download activity, enabling users to assess dataset popularity and reuse trends directly within the PRIDE interface.} -\label{fig:pride_viz} -\end{figure} - % ====================================================================== \section{Results} % ====================================================================== \subsection{Global PRIDE Usage Patterns} -The PRIDE Archive serves users in 213 countries, demonstrating truly global reach (Figure~\ref{fig:geographic}). The top five countries by download volume are the United States (5.1M downloads, 26.8\%), the United Kingdom (4.5M, 23.6\%), Germany (4.3M, 22.5\%), South Korea (869K, 4.6\%), and Canada (691K, 3.6\%). Europe accounts for the largest share of downloads (55.7\%), followed by the Americas (30.7\%) and Asia (11.4\%). 
+Over the 2021--2025 study period, 19.1 million genuine individual file downloads were recorded across 34,085 datasets, representing 96.4\% of all public PRIDE datasets, accessed from 213 countries. On average, each dataset has been downloaded 560 times (median 85), and the number of unique datasets accessed per year grew from 14,879 in 2021 to 33,376 in 2025, reflecting both the growing PRIDE collection and increasing data download. Download activity has grown substantially (Figure~\ref{fig:temporal}A): annual downloads increased from 1.2 million in 2021 to 3.5 million in 2022, reaching 5.3 million in 2025, representing a 352\% growth over the five-year period. + +\begin{figure}[H] +\centering +\includegraphics[width=\textwidth]{figures/figure2_temporal_trends.pdf} +\caption{Temporal trends in PRIDE usage. (A) Annual download volumes in millions of files. (B) Growth in unique datasets accessed and unique locations per year.} +\label{fig:temporal} +\end{figure} + +The geographic reach of PRIDE data reuse is truly global (Figure~\ref{fig:geographic}). The top five countries per download volumes are the United States (5.1~M downloads, 26.8\%), the United Kingdom (4.5~M, 23.6\%), Germany (4.3~M, 22.5\%), South Korea (869~K, 4.6\%), and Canada (691~K, 3.6\%). Grouped by continent, Europe accounts for the largest share of downloads (55.7\%), followed by the Americas (30.7\%) and Asia (11.4\%). \begin{figure}[H] \centering \includegraphics[width=0.85\textwidth]{figures/figure1_geographic_distribution.pdf} -\caption{Geographic distribution of PRIDE downloads. Top 20 countries by download volume after bot removal (2020-2025).} +\caption{Geographic distribution of PRIDE downloads. Top 20 countries per download volumes after bot removal (2021-2025).} \label{fig:geographic} \end{figure} -To characterize the relationship between user base size and download intensity, we plotted total downloads against unique users for the top 50 countries (Figure~\ref{fig:bubble}A). 
Download patterns vary considerably across countries. Some, such as France (10,844 users, 15 downloads/user) and Canada (9,124 users, 76 downloads/user), show broad user bases with moderate per-user activity, suggesting predominantly individual researchers. Others, such as Hong Kong (176 users, 832 downloads/user) and Singapore (703 users, 478 downloads/user), exhibit high per-user averages that likely reflect a small number of heavy institutional users or hub-like access points concentrating the median. Most countries fall on a spectrum between these extremes. This variation highlights that aggregate download volume alone does not fully capture the nature of data reuse, and that the balance between distributed individual access and concentrated institutional access differs markedly across regions. +To characterize the relationship between user base size and download intensity, we plotted total downloads against unique users for the top 50 countries (Figure~\ref{fig:bubble}A). Download patterns vary considerably across countries. Some, such as France (10,844 users, 15 downloads/user) and Canada (9,124 users, 76 downloads/user), show broad user bases with moderate activity per-user, suggesting predominantly individual researchers. Others, such as Belgium (1,858 users, 629 downloads/user) and Denmark (1,301 users, 530 downloads/user), exhibit high per-user averages that likely reflect a smaller number of heavy institutional users or hub-like access points concentrating the download activity. Most countries fall on a spectrum between these extremes. This variation highlights that aggregate download volume alone does not fully capture the nature of data reuse, and that the balance between distributed individual access and concentrated institutional access differs markedly across regions. -Although European countries account for the majority of PRIDE downloads, yearly trends reveal shifting dynamics (Figure~\ref{fig:bubble}B). 
Germany showed a pronounced peak in 2023 (2.4M downloads) before declining, while the United Kingdom peaked in 2022 (1.8M) with subsequent moderation. Notably, PRIDE usage is growing in low- and middle-income countries (Figure~\ref{fig:bubble}C) including the grow in countries like India (55K downloads), Mexico (23K), and Brazil (12K). This growth suggests that PRIDE is increasingly serving as a resource for researchers in developing nations, supporting broader global participation in proteomics data reuse. +Although European countries account for the majority of PRIDE downloads, yearly trends reveal shifting dynamics (Figure~\ref{fig:bubble}B). Germany showed a pronounced peak in 2023 (2.4~M downloads) before declining, while the United Kingdom peaked in 2022 (1.8~M) with lower average number of data downloads since. Notably, PRIDE usage is growing in some low- and middle-income countries (LMIC, as defined by the Wellcome Trust based on the OECD DAC list \citep{WellcomeLMIC}; Figure~\ref{fig:bubble}C) including India (55~K downloads), Mexico (23~K), and Brazil (12~K). This growth suggests that PRIDE is increasingly serving as a resource for researchers in developing nations, supporting broader global participation in proteomics data reuse. \begin{figure}[H] \centering @@ -140,20 +136,24 @@ \subsection{Global PRIDE Usage Patterns} \label{fig:bubble} \end{figure} -\subsection{Temporal Trends} +\subsection{Download Concentration} -Download activity has grown substantially over the study period (Figure~\ref{fig:temporal}). Annual downloads increased from 1.2 million in 2021 to 3.5 million in 2022, peaking at 5.8 million in 2025 (after removing bots). The number of unique datasets accessed per year grew from 14,879 in 2021 to 18,621 in 2024, reflecting both the growing PRIDE collection and increasing data reuse. +Dataset downloads follows a highly skewed distribution characteristic of heavy-tailed systems (Figure~\ref{fig:concentration}A). 
The Gini coefficient -- a standard measure of inequality ranging from 0 (perfect equality) to 1 (maximum concentration) -- of 0.84 indicates substantial inequality: the top 1\% of datasets (341 datasets) account for 43.3\% of all downloads, the top 10\% account for 77.2\%, and the bottom 50\% of datasets collectively represent only 3.1\% of downloads. Concretely, whereas the ``average'' dataset has received 85 downloads, the most popular one exceeds 355,000. The rank-frequency distribution reveals a characteristic long tail, with download counts dropping steeply beyond the top 1\% of datasets. + +Importantly, the most downloaded datasets are not one-time events but show sustained behaviour over multiple years (Figure~\ref{fig:concentration}B). Of the top 25 datasets, most have been actively downloaded in at least 4 of the 5 years covered (2021-2025). The most downloaded datasets fall into two main categories: community benchmark resources and tissue atlas datasets. The ProteomeTools synthetic peptide libraries (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD004732}{PXD004732}, 355K downloads; \href{https://www.ebi.ac.uk/pride/archive/projects/PXD021013}{PXD021013}, 303K; \href{https://www.ebi.ac.uk/pride/archive/projects/PXD010595}{PXD010595}, 255K) rank among the top five, reflecting their widespread use as training data for machine learning models and spectral library search engines. Tissue atlas datasets including, the deep proteome atlas of 29 human tissues (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD010154}{PXD010154}, 308K downloads), A Draft Map of the Human Proteome (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD000561}{PXD000561}, 299K), and the Quantitative Proteome Map of the Human Body (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD016999}{PXD016999}, 41K), show sustained multi-year download activity, consistent with their role as reference maps for tissue-specific protein expression. 
A ranking of the top 20 most downloaded datasets and extended analyses are provided in Supplementary Notes Section~S8. \begin{figure}[H] \centering -\includegraphics[width=\textwidth]{figures/figure2_temporal_trends.pdf} -\caption{Temporal trends in PRIDE usage. (A) Annual download volume in millions. (B) Growth in unique datasets accessed and unique locations per year.} -\label{fig:temporal} +\includegraphics[width=\textwidth]{figures/figure5_dataset_concentration.pdf} +\caption{Dataset downloads concentration and consistency. (A) Rank-frequency distribution on log-log scale; the dashed red line marks the top 1\% boundary, beyond which download counts drop sharply (Gini coefficient = 0.84). (B) Download consistency heatmap for the top 25 most downloaded datasets (2021-2025, after bot removal); color intensity represents download count on a log$_{10}$ scale. Most top datasets show sustained data download volumes across 4-5 years, indicating their role as community reference and benchmark datasets. (C) Relationship between EuropePMC reuse citations and download volume for the top 50 most downloaded datasets (n$-$1 correction to exclude the original publication including the submitted dataset); point color indicates download consistency (fraction of years active out of five), grey points represent datasets with no independent reuse citations.} +\label{fig:concentration} \end{figure} -\subsection{Protocol Usage} +To assess whether download popularity reflects broader scientific impact, we queried (\href{https://europepmc.org/}{EuropePMC}) for ``reuse citations'' of each dataset accession among the top 50 most downloaded datasets, subtracting the original submission publication to count only independent reuse mentions (Figure~\ref{fig:concentration}C). Of these, 37 datasets (74\%) have been cited in at least one independent publication beyond their original submission, with a median of 2 reuse citations and a maximum of 52 (dataset PXD000561). 
While the correlation between download volume and reuse citation count shows a positive trend, it does not reach statistical significance (Spearman $\rho$ = 0.206, $p$ = 0.22), suggesting that high download counts do not simply mirror citations including data reuse activities. It should be noted that EuropePMC covers only open access publications, which represent a fraction of the total scientific literature, potentially underestimating the true citation-based impact of these datasets. The lack of significant correlation between downloads and reuse citations (Spearman $\rho$ = 0.206, $p$ = 0.22) demonstrates that download statistics capture dimensions of dataset impact that are orthogonal to citation-based metrics: many heavily downloaded datasets have few or no reuse citations, and vice versa. This indicates that download activity -- particularly sustained, multi-year download patterns -- provides an independent and complementary measure of genuine community adoption that cannot be inferred from publication records alone. + +\subsection{File Transfer Protocol Usage} -Download protocol preferences have shifted over the study period (Figure~\ref{fig:protocols}A). FTP was the dominant protocol in 2021, accounting for 65\% of genuine downloads. HTTP overtook FTP in 2022 (54\%), but FTP surged again in 2023 (79\%), likely driven by institutional hub traffic that relies on FTP for bulk transfers. By 2025, HTTP re-emerged as the leading protocol (69\%), reflecting broader adoption of web-based download tools. Despite superior transfer performance for large files, advanced protocols such as Aspera and Globus remain in early adoption stages, accounting for 3.3\% and 1.0\% of 2025 downloads respectively. To lower adoption barriers, we released \texttt{pridepy} \citep{Kamatchinathan2025} in March 2025, a Python-based command-line tool that abstracts protocol complexity and enables seamless switching between FTP, Aspera, and Globus transfers with a single command. 
A monthly breakdown of 2025 downloads (Figure~\ref{fig:protocols}B) shows emerging Aspera usage alongside sustained Globus adoption, indicating that providing user-friendly tooling can facilitate the transition to high-performance transfer protocols in scientific data repositories. +Download protocol preferences have shifted over the study period (Figure~\ref{fig:protocols}A). FTP was the dominant protocol in 2021, accounting for 65\% of genuine downloads. HTTP overtook FTP in 2022 (54\%), but FTP surged again in 2023 (79\%), likely driven by institutional hub traffic that relies on FTP for bulk transfers. By 2025, HTTP re-emerged as the leading protocol (69\%), reflecting broader adoption of web-based download tools. Despite superior transfer performance for large files, advanced file transfer protocols such as Aspera and Globus remain in early adoption stages, accounting for 3.3\% and 1.0\% of 2025 downloads, respectively. To lower adoption barriers, we released \texttt{pridepy} \citep{Kamatchinathan2025} in March 2025, a Python-based command-line tool that abstracts protocol complexity and enables seamless switching between FTP, Aspera, and Globus file transfers with a single command. A monthly breakdown of 2025 downloads (Figure~\ref{fig:protocols}B) shows emerging Aspera usage alongside sustained Globus adoption, indicating that providing user-friendly tooling can facilitate the transition to high-performance transfer protocols in scientific data repositories. \begin{figure}[H] \centering @@ -162,24 +162,9 @@ \subsection{Protocol Usage} \label{fig:protocols} \end{figure} -\subsection{Download Concentration} - -Dataset reuse follows a highly skewed distribution characteristic of heavy-tailed systems (Figure~\ref{fig:concentration}A). 
The Gini coefficient of 0.84 indicates substantial inequality: the top 1\% of datasets (341 datasets) account for 43.3\% of all downloads, the top 10\% account for 77.2\%, and the bottom 50\% of datasets collectively represent only 3.1\% of downloads. The median dataset has received 85 downloads, while the most popular exceeds 355,000. The rank-frequency distribution reveals a characteristic long tail, with download counts dropping steeply beyond the top 1\% of datasets. - -Importantly, the most downloaded datasets are not one-time events but show sustained reuse over multiple years (Figure~\ref{fig:concentration}B). Of the top 25 datasets, most have been actively downloaded in at least 4 of the 5 years covered (2021-2025), and PXD000001 - the first dataset deposited in PRIDE - has been downloaded every year. Several datasets exhibit pronounced temporal spikes (e.g., PXD021013 with 303K downloads, PXD029360 with 148K), likely reflecting their use as benchmarks in specific studies or community challenges. Others maintain steady download rates across years (e.g., PXD004732, PXD000561), suggesting their role as long-term reference datasets for the proteomics community. A ranking of the top 20 most downloaded datasets and extended analyses are provided in Supplementary Notes Section~S8. - -\begin{figure}[H] -\centering -\includegraphics[width=\textwidth]{figures/figure5_dataset_concentration.pdf} -\caption{Dataset reuse concentration and consistency. (A) Rank-frequency distribution on log-log scale; the dashed red line marks the top 1\% boundary, beyond which download counts drop sharply (Gini = 0.84). (B) Download consistency heatmap for the top 25 most downloaded datasets (2021-2025, after bot removal); color intensity represents download count on a log$_{10}$ scale. Most top datasets show sustained reuse across 4-5 years, indicating their role as community reference and benchmark datasets. 
(C) Relationship between EuropePMC reuse citations and download volume for the top 50 most downloaded datasets (n$-$1 correction to exclude the original submission); point color indicates download consistency (fraction of years active out of five), grey points represent datasets with no independent reuse citations.} -\label{fig:concentration} -\end{figure} - -To assess whether download popularity reflects broader scientific impact, we queried EuropePMC for reuse citations of each dataset accession among the top 50 most downloaded datasets, subtracting the original submission publication to count only independent reuse mentions (Figure~\ref{fig:concentration}C). Of these, 37 datasets (74\%) have been cited in at least one independent publication beyond their original submission, with a median of 2 reuse citations and a maximum of 52 (PXD000561). While the correlation between download volume and reuse citation count shows a positive trend, it does not reach statistical significance (Spearman $\rho$ = 0.206, $p$ = 0.22), suggesting that high download counts do not simply mirror publication visibility. This indicates that sustained, multi-year download activity - rather than raw download volume alone - is the stronger signal of genuine community adoption and complements publication-based impact metrics. - \subsection{Download Hubs} -Our classification identified 664 download hubs distributed across 58 countries (Figure~\ref{fig:hubs}), accounting for 18.0 million downloads. These hubs represent institutions that systematically and continuously reanalyze public proteomics data \citep{PerezRiverol2022reanalysis}, including institutional mirrors, research infrastructure nodes, and data aggregation services. The United States hosts the most hubs (155), followed by Germany (99), Japan (46), and the Netherlands (38), with total hub download volume led by the United States (4.7M), the United Kingdom (4.5M), and Germany (4.2M). 
The geographic spread of hubs - spanning all continents - demonstrates that systematic data reuse is not confined to a few centers but is a global phenomenon. Hub characteristics vary widely: some operate with very few users but extremely high per-user download rates (e.g., Dresden with 275K downloads/user from 8 users, consistent with a mirror), while others involve hundreds of users accessing data at moderate intensity (e.g., Melbourne with 186 users). +Our classification identified 664 institutional download hubs distributed across 58 countries (Figure~\ref{fig:hubs}), accounting for 18.0 million downloads (11.3\% of total traffic). These hubs represent institutions that systematically and continuously download public proteomics data \citep{PerezRiverol2022reanalysis} for data reuse/reanalysis purposes, including institutional mirrors, research infrastructure nodes, and data aggregation services. The United States hosts the most hubs (155, 23.3\%), followed by Germany (99), Japan (46), and the Netherlands (38), with total hub download volume led by the United States (4.7~M), the United Kingdom (4.5~M), and Germany (4.2~M). The geographic spread of hubs -- spanning all six inhabited continents -- demonstrates that systematic data download for data reuse purposes seems not to be confined to a few centers but is a global phenomenon. Hub characteristics vary widely: some operate with very few users but extremely high per-user download rates (e.g., Dresden, Germany with 275~K downloads/user from 8 users, consistent with the behavior of a mirror), while others involve hundreds of users accessing data at a moderate intensity (e.g., Melbourne, Australia with 186 users). \begin{figure}[H] \centering @@ -190,7 +175,7 @@ \subsection{Download Hubs} \subsection{File Type Download Patterns} -More than 81\% of all downloads originate from five countries, and developing countries are largely absent from the top 10. 
Analysis of file type download patterns across regions reveals distinct usage profiles (Figure~\ref{fig:filetype}): raw instrument files dominate downloads in all regions, accounting for 72-73\% of traffic in East Asia and North America. LMIC countries show a lower raw file proportion (54\%) with a corresponding increase in result files and processed spectra \citep{PerezRiverol2022reanalysis}. This imbalance highlights that most users currently need to download and reprocess raw data from scratch, even when search engine results already exist within the submission - underscoring the need for better infrastructure to make analysis results more discoverable and independently downloadable. +More than 81\% of all downloads originate from five countries, and developing countries are largely absent from the top 10. Analysis of file type download patterns across regions reveals distinct usage profiles (Figure~\ref{fig:filetype}): raw instrument files dominate downloads in all regions, accounting for 72-73\% of traffic in East Asia and North America. LMIC countries show a lower raw file proportion (54\%) with a corresponding increase in result files and processed spectra (peak list files) \citep{PerezRiverol2022reanalysis}. This imbalance highlights that most users currently need to download and reprocess raw data from scratch, even when search engine results already exist within the submission - underscoring the need for better infrastructure to make analysis results more discoverable and independently downloadable. \begin{figure}[H] \centering @@ -199,34 +184,36 @@ \subsection{File Type Download Patterns} \label{fig:filetype} \end{figure} +\subsection{PRIDE Download Statistics Visualization} + +The aggregated download statistics produced by \texttt{nf-downloadstats} are stored in MongoDB and Elasticsearch to enable fast searching and visualization within the PRIDE Archive web interface. 
For each dataset, the total number of downloads is displayed alongside a gradient bar indicating its download percentile, with higher intensity representing datasets in the top 1\% of downloads (Figure~\ref{fig:pride_viz}). Additionally, a yearly trend chart is provided for each dataset, illustrating download activity over time. Datasets can be sorted by both total downloads per project and a normalized metric that accounts for the number of files within each project, enabling users to identify highly reused datasets for benchmarking or reanalysis. These features allow PRIDE data submitters to use download statistics in grant reports and publications, and provide the community with a tool to discover highly reused datasets. + +\begin{figure}[H] +\centering +\includegraphics[width=0.75\textwidth]{figures/figure_pride_visualization.png} +\caption{PRIDE Archive download statistics integration. Each dataset displays a gradient percentile bar and total download count. A popup trend chart shows yearly download activity, enabling users to assess dataset popularity and reuse trends directly within the PRIDE interface.} +\label{fig:pride_viz} +\end{figure} + % ====================================================================== \section{Discussion} % ====================================================================== \begin{sloppypar} -That 88.0\% of PRIDE download traffic is automated is consistent with the high bot prevalence observed across scientific data repositories, which offer unauthenticated, persistent, and predictable access to valuable content \citep{Imperva2023}. Without filtering, the most ``popular'' datasets may simply be the most bot-targeted, rendering raw download counts unreliable as impact indicators - a consequential problem given that download statistics are increasingly used by funding agencies, institutions, and researchers as evidence of data impact alongside traditional citation metrics \citep{PerezRiverol2019}. 
DeepLogBot demonstrates that a scalable deep learning approach combining Isolation Forest anomaly detection with behavioral classification can effectively recover genuine usage signals from heavily contaminated logs. The framework processes over 159 million records, generalizes across diverse bot behaviors without manual rule tuning, and distinguishes legitimate automation (institutional hubs) from harmful scraping - a distinction that rule-based methods alone handle poorly (hub F1 = 0.275 vs.\ 0.718). As AI-driven platforms that perform large-scale automated reanalysis become more prevalent, the boundary between harmful scraping and beneficial programmatic access will become increasingly blurred, and repositories will need adaptive classification schemes that evolve alongside legitimate automation patterns. +Here we have performed a detailed study of the PRIDE data download statistics for the last 5 years (2021--2025). Interestingly, the fact that 88.0\% of PRIDE download traffic is automated is consistent with the high bot prevalence observed across scientific data repositories, which offer unauthenticated, persistent, and predictable access to valuable content \citep{Orr2025}. Without filtering, the most ``downloaded'' datasets may simply be the most bot-targeted, rendering raw download volumes unreliable as scientific impact indicators -- a consequential problem given that download statistics are increasingly used by funding agencies, institutions, and researchers as evidence of data impact alongside traditional citation metrics \citep{PerezRiverol2019}. DeepLogBot demonstrates that a scalable deep learning approach combining Isolation Forest anomaly detection with behavioral classification can effectively recover genuine usage signals from heavily contaminated logs. 
The framework processes over 159 million records, generalizes across diverse bot behaviors without manual rule tuning, and distinguishes legitimate automation (institutional hubs) from harmful scraping -- a distinction that rule-based methods alone handle poorly (hub F1 = 0.275 vs.\ 0.718). As AI-driven platforms that perform large-scale automated reanalysis become more prevalent, the boundary between harmful scraping and beneficial programmatic access will become increasingly blurred, and repositories will need adaptive classification schemes that evolve alongside legitimate automation patterns. -PRIDE downloads have grown from 1.2 million in 2021 to 5.8 million in 2025, confirming accelerating data reuse across a geographically broad user base (213 countries). Download intensity varies markedly: some countries exhibit broad individual user bases (e.g., United States with 64K users), while others show concentrated institutional access (e.g., Hong Kong with 832 downloads/user, Singapore with 478 downloads/user), suggesting that the nature of reuse - individual exploration versus systematic reanalysis - differs between research communities. The 664 download hubs we identified reveal a global infrastructure of institutional data consumers, from single-user mirrors performing full-repository synchronization to multi-user reanalysis centers processing hundreds of datasets. This hub distribution provides an empirical map of where proteomics bioinformatics infrastructure exists and can inform ProteomeXchange decisions about mirror placement, edge caching, and regional resource allocation - for instance, countries with growing user bases but no local hubs (e.g., India, Brazil, Mexico) may benefit from targeted infrastructure support. 
FTP and HTTP have alternated as the dominant protocol, with the shift to HTTP dominance in 2025 (69\%) reflecting both broader individual adoption and the growing importance of API-based programmatic access; advanced protocols (Aspera, Globus) remain in early stages (1.4\% of genuine traffic), but tools such as \texttt{pridepy} \citep{Kamatchinathan2025} should lower adoption barriers as datasets continue to grow in size. +PRIDE download volumes have grown 342\% in 5 years, from 1.2 million in 2021 to 5.3 million in 2025, confirming accelerating data downloads across a geographically broad user base (213 countries). Download intensity varies markedly: some countries exhibit broad individual user bases, while others show concentrated institutional access (e.g., Belgium with 629 downloads/user, Denmark with 530 downloads/user), suggesting that the nature of data downloads -- individual exploration versus systematic reanalysis -- differs between research communities. The 664 download hubs we identified reveal a global infrastructure of institutional data consumers, from single-user mirrors performing full-repository synchronization to multi-user reanalysis centers processing hundreds of datasets. This hub distribution provides an empirical map of where proteomics bioinformatics infrastructure exists and can inform ProteomeXchange decisions about potential PRIDE mirror placement, edge caching, and regional resource allocation -- for instance, countries with growing user bases but no local hubs (e.g., India, Brazil, Mexico) may benefit from targeted infrastructure support. 
FTP and HTTP have alternated as the dominant protocol, with the shift to HTTP dominance in 2025 (69\%) reflecting both broader individual adoption and the growing importance of API-based programmatic access; advanced protocols (Aspera, Globus) remain in early stages (1.4\% of genuine traffic), but tools such as \texttt{pridepy} \citep{Kamatchinathan2025} should lower adoption barriers as datasets continue to grow in size. -Dataset reuse is highly concentrated (Gini = 0.84), with the top 1\% of datasets accounting for 43.3\% of all downloads. While community reference datasets such as ProteomeTools (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD004732}{PXD004732}) show sustained multi-year reuse - likely because its comprehensive synthetic peptide spectral libraries serve as training data for machine learning models, retention time predictors, and spectral library search engines across the field - the ``long tail'' of rarely downloaded datasets should not be disregarded: these datasets may gain future value through meta-analyses, machine learning applications, or integration into multi-omics studies. Repositories can better serve both ends of this distribution by investing in improved discoverability - richer metadata, curated tags, and recommendation systems - alongside prioritized access for high-demand datasets. Regional differences in file type usage, with LMIC countries showing higher reliance on processed results rather than raw files, suggest that computational capacity and bandwidth constraints shape reuse patterns. The dominance of raw file downloads across all regions (Figure~\ref{fig:filetype}A) indicates that researchers currently lack easy access to analysis results within submissions, forcing them to re-download and reprocess raw data even when search engine outputs already exist. 
To address this, the PRIDE team is developing dedicated infrastructure for discovering, browsing, and downloading result and analysis files independently of the full raw dataset, enabling researchers with limited computational resources to directly access quantification tables, identification lists, and processed spectra without the overhead of re-running search engines. In parallel, the PRIDE team will prioritize SDRF sample metadata annotation \citep{Dai2021} for the most downloaded and community-relevant datasets identified in this study, making these high-impact submissions immediately reusable through standardized experimental design descriptions. Several complementary efforts support this vision: \texttt{quantms} \citep{Dai2024} generates standardized reanalysis outputs from public datasets, the PTMExchange initiative (\url{https://www.proteomexchange.org/ptmexchange}) provides curated post-translational modification results, and the PRIDE team is collaborating with developers of widely used search engines - including DIA-NN \citep{Demichev2020}, MaxQuant \citep{Cox2008}, and MSFragger \citep{Kong2017} - to define standardized submission guidelines that ensure result files, quantification tables, and metadata are structured for immediate reuse. More broadly, the \texttt{nf-downloadstats} pipeline and DeepLogBot framework are applicable to any open data repository facing similar challenges, including genomics (ENA/SRA), structural biology (PDB), and metabolomics (MetaboLights) resources. +Dataset downloads are highly concentrated (Gini = 0.84), with the top 1\% of datasets accounting for 43.3\% of all downloads. 
While community reference datasets such as ProteomeTools (\href{https://www.ebi.ac.uk/pride/archive/projects/PXD004732}{PXD004732}) show sustained multi-year downloads -- likely because its comprehensive synthetic peptide spectral libraries serve as training data for machine learning models, retention time predictors, and spectral library search engines across the field -- the ``long tail'' of rarely downloaded datasets should not be disregarded: these datasets may gain future value through meta-analyses, machine learning applications, or integration into multi-omics studies. Repositories can better serve both ends of this distribution by investing in improved discoverability -- richer metadata, curated tags, and recommendation systems -- alongside prioritized access for high-demand datasets. Regional differences in file type usage, with LMIC countries showing higher reliance on processed results rather than raw files, suggest that computational capacity and bandwidth constraints shape data download patterns. The dominance of raw file downloads across all regions (Figure~\ref{fig:filetype}A) indicates that researchers currently lack easy access to analysis results within submissions, forcing them to re-download and reprocess raw data even when search engine outputs already exist. To address this, the PRIDE team is developing dedicated infrastructure for discovering, browsing, and downloading result and analysis files independently of the full raw dataset, enabling researchers with limited computational resources to directly access quantification tables, identification lists, and processed spectra (peak list files) without the overhead of re-running search engines. In parallel, the PRIDE team will prioritize SDRF sample metadata annotation \citep{Dai2021} for the most downloaded and community-relevant datasets identified in this study, making these high-impact submissions immediately reusable through standardized experimental design descriptions. 
Several complementary efforts support this vision: \texttt{quantms} \citep{Dai2024} generates standardized reanalysis outputs from public datasets, the PTMExchange initiative (\url{https://www.proteomexchange.org/ptmexchange}) provides harmonised results coming from the reanalysis of PTM-enriched datasets, and the PRIDE team is collaborating with developers of widely used search engines -- including DIA-NN \citep{Demichev2020}, MaxQuant \citep{Cox2008}, and MSFragger \citep{Kong2017} -- to define standardized submission guidelines that ensure result files, quantification tables, and metadata are structured for immediate reuse. More broadly, the \texttt{nf-downloadstats} pipeline and DeepLogBot framework are applicable to any EBI open data repository facing similar challenges, including genomics (ENA/SRA), structural biology (PDB), and metabolomics (MetaboLights) resources. \end{sloppypar} % ====================================================================== \section{Conclusion} % ====================================================================== -We present the PRIDE Archive download tracking infrastructure and the first comprehensive analysis of download patterns from the PRIDE proteomics archive, covering 159 million records over five years. The infrastructure comprises \texttt{nf-downloadstats}, a scalable Nextflow pipeline for processing large-scale download logs, and DeepLogBot, a bot detection framework with two complementary algorithms achieving up to 0.775 macro F1. After removing 88.0\% of traffic identified as automated, we obtain reliable usage metrics for 19.1 million genuine downloads spanning 34,085 datasets. - -Our analysis reveals a globally distributed user base led by the United States, the United Kingdom, and Germany, a transition from FTP to HTTP-based access with emerging adoption of high-throughput protocols (Aspera, Globus), and a highly concentrated dataset reuse distribution. 
On average, any PRIDE dataset file has been downloaded at least 30 times from 2021 to 2025, and more than 96\% of the datasets in PRIDE have been downloaded at least once. - -A particularly noteworthy finding is the identification of 664 download hubs distributed across 58 countries, accounting for 18.0 million downloads (11.3\% of total traffic). These hubs represent research groups and institutions that systematically reanalyze public proteomics data - whether to complement their own in-house experiments or to build community-wide resources such as \texttt{quantms} \citep{Dai2024}, PeptideAtlas \citep{Desiere2006}, GPMDB \citep{Craig2004}, Scop3P \citep{Decoster2022}, and MatrisomeDB \citep{Shao2020}. The global distribution of these hubs reinforces the role of PRIDE as a centralized, standardized, and reliable repository for proteomics data worldwide: rather than requiring data to be replicated and stored across multiple national or regional archives, the community benefits from a single curated resource from which data can be accessed and reanalyzed anywhere in the world.These findings provide evidence for the growing impact of open proteomics data and offer actionable insights for repository development. - -The PRIDE team, through \texttt{pridepy} \citep{Kamatchinathan2025} and ongoing infrastructure development, will continue releasing tools and features that enable researchers to discover, query, and download result files - including protein and peptide identifications, quantification tables, and processed spectra - independently of the full raw dataset. This is particularly important for researchers in low- and middle-income countries, who, as our file type analysis shows, rely more heavily on processed results than on raw files. 
Beyond standard community file formats such as mzIdentML and mzTab, we will collaborate with developers of widely used search engines to improve the representation and standardization of result-level information deposited in PRIDE, ensuring that analysis outputs are structured for immediate reuse. - -The highly skewed reuse distribution - where the top 1\% of datasets account for 43.3\% of all downloads while half of all datasets collectively represent only 3.1\% - highlights the need for improved discoverability of valuable but underutilized datasets. To address this, PRIDE will invest in richer metadata annotation through SDRF sample descriptions \citep{Dai2021} for the most downloaded and community-relevant datasets, deploy quality control reports generated by tools such as pmultiqc \citep{Dai2024pmultiqc}, and develop recommendation systems that surface relevant datasets based on experimental similarity rather than popularity alone. These efforts aim to lower the barrier to finding and reusing the ``long tail'' of datasets that may be highly relevant to specific research questions but currently lack the visibility to attract broad download activity. - +We present the PRIDE database download tracking infrastructure, comprising \texttt{nf-downloadstats} and DeepLogBot, and the first comprehensive analysis of PRIDE data download statistics, processing 159 million records spanning 2021--2025. After removing 88.0\% of automated traffic, the remaining 19.1 million genuine downloads across 34,085 datasets and 213 countries reveal a globally distributed user base, a transition toward HTTP-based access with emerging high-throughput protocols, a highly concentrated download distribution (Gini = 0.84), and 664 institutional download hubs spanning 58 countries. These findings provide actionable insights for repository infrastructure planning and potential PRIDE mirror placement, and demonstrate the value of bot-aware analytics for scientific data resources. 
-More broadly, the \texttt{nf-downloadstats} pipeline and DeepLogBot framework are freely available and applicable to any open data repository facing similar challenges, including genomics (ENA/SRA), structural biology (PDB), and metabolomics (MetaboLights) resources. +PRIDE has integrated download statistics into the PRIDE web interface, enabling data submitters to use these metrics in grant reports and publications. Through \texttt{pridepy} \citep{Kamatchinathan2025} and dedicated infrastructure for result-level data access, we aim to lower barriers for researchers (particularly in LMICs) to discover and download analysis outputs without reprocessing full raw datasets. The \texttt{nf-downloadstats} pipeline and DeepLogBot framework are freely available and applicable to any EBI open data repository facing similar challenges. \section*{Data and Code Availability} 
generated the infrastructure for log anonymization and provided the log files to the PRIDE team; J.A.V. reviewed the manuscript; Y.P-R. designed the study, developed the bot detection framework, performed the analysis, and wrote the manuscript. -\section*{Acknowledgements} +\section*{Acknowledgements -- Funding} -S.H. implemented the nextflow workflow; and the collected the data; J.B implemented web interface for the downloads components; C.B, S.K. implemented the integration of the statistics components in the backend of PRIDE and databases; D.J.K, N.S.J., B.B-H., N.M. contributed to review the manuscript; the data generated and curate some of the datasets; M.R.D. generated the infrastructure for logs anonimization and provided the logs files to PRIDE team, J.A.V. review the manuscript, Y.P-R design the study; developed the bot detection framework; performed the analysis and wrote the manuscript. We thank the PRIDE team for their support and feedback on the development of the download tracking infrastructure and analysis. We also wants to thanks professor Bernard Kuster for the original discussion about this topic in 2024 during the 2024 HUPO conference in Dresden +This work was supported by EMBL core funding; Wellcome [223745/Z/21/Z; 301300/Z/24/Z]; Biotechnology and Biological Sciences Research Council [BB/Y513829/1, BB/S01781X/1, BB/V018779/1, BB/X001911/1]; Engineering and Physical Sciences Research Council [EP/Y035984/1]; UK Research and Innovation [UKRI701]; US National Science Foundation [NSF/2324278]. We thank the PRIDE team for their support and feedback on the development of the download tracking infrastructure and analysis. We also thank Professor Bernhard Kuster for the original discussion about this topic in 2024 during the HUPO conference in Dresden. \section*{Conflict of Interest} The authors declare no conflict of interest. 
-\bibliographystyle{unsrtnat} +\bibliographystyle{proteomics} \bibliography{references} \end{document} diff --git a/paper/proteomics.bst b/paper/proteomics.bst new file mode 100644 index 0000000..ac403f2 --- /dev/null +++ b/paper/proteomics.bst @@ -0,0 +1,1547 @@ +%% +%% This is file `angew.bst', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% rsc.dtx (with options: `bst,angew') +%% ---------------------------------------------------------------- +%% rsc --- BibTeX styles for Royal Society of Chemistry and Wiley +%% journals +%% E-mail: joseph.wright@morningstar2.co.uk +%% Released under the LaTeX Project Public License v1.3c or later +%% See http://www.latex-project.org/lppl.txt +%% ---------------------------------------------------------------- +%% +ENTRY + { address + author + booktitle + chapter + ctrl-use-title + ctrl-etal-number + ctrl-link-doi + ctrl-use-doi-all + doi + edition + editor + howpublished + institution + journal + key + note + number + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label + extra.label + short.list + } + +INTEGERS { output.state before.all mid.sentence } +INTEGERS { after.sentence after.block after.item } +INTEGERS { author.or.editor } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := + #4 'after.item := +} + +INTEGERS { is.use.title etal.number use.doi.all link.doi } + +%% #0 turns off the display of the title for articles +%% #1 enables +FUNCTION {default.is.use.title} { #1 } + +%% The number of names that force "et al." 
to be used +FUNCTION {default.etal.number} { #4 } + +%% #0 turns off the display of the DOI for articles +%% #1 enables +FUNCTION {default.use.doi.all} { #1 } + +%% #0 turns off hyperlinks for DOI +%% #1 enables +FUNCTION {default.link.doi} { #1 } + +FUNCTION {add.comma} +{ ", " * } + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { add.comma write$ } + { output.state after.block = + { add.comma write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { output.state after.item = + { " " * write$ } + { add.period$ " " * write$ } + if$ + } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "Empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +INTEGERS { would.add.period.textlen } + +FUNCTION {would.add.period} +{ duplicate$ + add.period$ + text.length$ + 'would.add.period.textlen := + duplicate$ + text.length$ + would.add.period.textlen = + { #0 } + { #1 } + if$ +} + +FUNCTION {fin.entry} +{ would.add.period + { "\relax" * write$ newline$ + "\mciteBstWouldAddEndPuncttrue" write$ newline$ + "\mciteSetBstMidEndSepPunct{\mcitedefaultmidpunct}" + write$ newline$ + "{\mcitedefaultendpunct}{\mcitedefaultseppunct}\relax" + } + { "\relax" * write$ newline$ + "\mciteBstWouldAddEndPunctfalse" write$ newline$ + "\mciteSetBstMidEndSepPunct{\mcitedefaultmidpunct}" + write$ newline$ + "{}{\mcitedefaultseppunct}\relax" + } + if$ + write$ + newline$ + "\EndOfBibitem" write$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "\emph{" swap$ * "}" * } + if$ +} + +FUNCTION {boldface} +{ duplicate$ empty$ + { 
pop$ "" } + { "\textbf{" swap$ * "}" * } + if$ +} + +FUNCTION {paren} +{ duplicate$ empty$ + { pop$ "" } + { "(" swap$ * ")" * } + if$ +} + +FUNCTION {bbl.and} +{ "and" } + +FUNCTION {bbl.chapter} +{ "Chapter" } + +FUNCTION {bbl.doi} +{ "DOI" } + +FUNCTION {bbl.editor} +{ "Ed.:" } + +FUNCTION {bbl.editors} +{ "Eds.:" } + +FUNCTION {bbl.edition} +{ "ed." } + +FUNCTION {bbl.etal} +{ "et~al." emphasize } + +FUNCTION {bbl.in} +{ "in" } + +FUNCTION {bbl.inpress} +{ "in press" } + +FUNCTION {bbl.msc} +{ "MSc thesis" } + +FUNCTION {bbl.page} +{ "p." } + +FUNCTION {bbl.pages} +{ "pp." } + +FUNCTION {bbl.phd} +{ "PhD thesis" } + +FUNCTION {bbl.submitted} +{ "submitted for publication" } + +FUNCTION {bbl.techreport} +{ "Technical Report" } + +FUNCTION {bbl.volume} +{ "Vol." } + +FUNCTION {bbl.first} +{ "1st" } + +FUNCTION {bbl.second} +{ "2nd" } + +FUNCTION {bbl.third} +{ "3rd" } + +FUNCTION {bbl.fourth} +{ "4th" } + +FUNCTION {bbl.fifth} +{ "5th" } + +FUNCTION {bbl.st} +{ "st" } + +FUNCTION {bbl.nd} +{ "nd" } + +FUNCTION {bbl.rd} +{ "rd" } + +FUNCTION {bbl.th} +{ "th" } + +FUNCTION {eng.ord} +{ duplicate$ "1" swap$ * + #-2 #1 substring$ "1" = + { bbl.th * } + { duplicate$ #-1 #1 substring$ + duplicate$ "1" = + { pop$ bbl.st * } + { duplicate$ "2" = + { pop$ bbl.nd * } + { "3" = + { bbl.rd * } + { bbl.th * } + if$ + } + if$ + } + if$ + } + if$ +} + +INTEGERS{ l } + +FUNCTION{string.length} +{ #1 'l := + { duplicate$ duplicate$ #1 l substring$ = not } + { l #1 + 'l := } + while$ + pop$ l +} + +STRINGS{replace find text} + +INTEGERS{find_length} + +FUNCTION{find.replace} +{ 'replace := + 'find := + 'text := + find string.length 'find_length := + "" + { text empty$ not } + { text #1 find_length substring$ find = + { replace * + text #1 find_length + global.max$ substring$ 'text := + } + { text #1 #1 substring$ * + text #2 global.max$ substring$ 'text := + } + if$ + } + while$ +} + +FUNCTION {chr.to.value} +{ chr.to.int$ #48 - + duplicate$ duplicate$ + #0 < swap$ #9 > or + { #48 
+ int.to.chr$ + " is not a number..." * + warning$ + pop$ #0 + } + {} + if$ +} + +FUNCTION{is.a.digit} +{ duplicate$ "" = + {pop$ #0} + {chr.to.int$ #48 - duplicate$ + #0 < swap$ #9 > or not} + if$ +} + +FUNCTION{is.a.number} +{ + { duplicate$ #1 #1 substring$ is.a.digit } + {#2 global.max$ substring$} + while$ + "" = +} + +FUNCTION {extract.num} +{ duplicate$ 't := + "" 's := + { t empty$ not } + { t #1 #1 substring$ + t #2 global.max$ substring$ 't := + duplicate$ is.a.number + { s swap$ * 's := } + { pop$ "" 't := } + if$ + } + while$ + s empty$ + 'skip$ + { pop$ s } + if$ +} + +FUNCTION {bibinfo.check} +{ swap$ + duplicate$ missing$ + { pop$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ pop$ + } + { swap$ + pop$ + } + if$ + } + if$ +} + +FUNCTION {convert.edition} +{ extract.num "l" change.case$ 's := + s "first" = s "1" = or + { bbl.first 't := } + { s "second" = s "2" = or + { bbl.second 't := } + { s "third" = s "3" = or + { bbl.third 't := } + { s "fourth" = s "4" = or + { bbl.fourth 't := } + { s "fifth" = s "5" = or + { bbl.fifth 't := } + { s #1 #1 substring$ is.a.number + { s eng.ord 't := } + { edition 't := } + if$ + } + if$ + } + if$ + } + if$ + } + if$ + } + if$ + t +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {space.connect} +{ " " swap$ * * } + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + numnames etal.number > etal.number #0 > and + { s #1 "{vv~}{ll},{~f.}{, jj}" format.name$ + #2 'nameptr := + { nameptr etal.number > not } + { ", " * + s nameptr "{vv~}{ll},{~f.}{, jj}" format.name$ * + nameptr #1 + 'nameptr := + } + while$ + bbl.etal space.connect + } + { + { namesleft #0 > } + { s nameptr "{vv~}{ll},{~f.}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { add.comma t * } + { numnames #2 > + { "" * } + 'skip$ + if$ + t "others" = + { bbl.etal 
space.connect } + { add.comma t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } + if$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { #1 'author.or.editor := + author format.names } + if$ +} + +FUNCTION {strip.comma} +{ duplicate$ + string.length 'find_length := + duplicate$ + find_length #1 - #1 substring$ + "," = + { #1 find_length #2 - substring$ + " " * + } + 'skip$ + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { #2 'author.or.editor := + strip.comma + editor num.names$ #1 > + { bbl.editors } + { bbl.editor } + if$ + " " * + editor format.names * paren + } + if$ +} + +FUNCTION {format.doi} +{ use.doi.all + { doi empty$ + 'skip$ + { + link.doi + { + "\href{http://dx.doi.org/" + doi * + "}{" * + bbl.doi doi tie.or.space.connect * + "}" * + } + { bbl.doi doi tie.or.space.connect } + if$ + output + } + if$ + } + 'skip$ + if$ +} + +FUNCTION {n.separate.multi} +{ 't := + "" + #0 'numnames := + t text.length$ #4 > t is.a.number and + { + { t empty$ not } + { t #-1 #1 substring$ is.a.number + { numnames #1 + 'numnames := } + { #0 'numnames := } + if$ + t #-1 #1 substring$ swap$ * + t #-2 global.max$ substring$ 't := + numnames #4 = + { duplicate$ #1 #1 substring$ swap$ + #2 global.max$ substring$ + "\," swap$ * * + #1 'numnames := + } + 'skip$ + if$ + } + while$ + } + { t swap$ * } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { bbl.volume volume tie.or.space.connect } + if$ +} + +FUNCTION {format.cvolume} +{ volume empty$ + { "" } + { + bbl.volume volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series * } + if$ + } + if$ +} + +FUNCTION {format.title.noemph} +{ 't := + t empty$ + { "" } + { t } + if$ +} + +FUNCTION {format.title} +{ 't := + t empty$ + { "" } + { t emphasize } + if$ +} + +FUNCTION {format.url} +{ url empty$ + { "" } + { "\url{" url * "}" * } + if$ +} + +FUNCTION {format.title.vol} +{ 't := + t empty$ + { "" } + { t 
emphasize } + if$ + volume empty$ + 'skip$ + { format.bvolume emphasize + swap$ add.comma swap$ * + } + if$ + } + +FUNCTION {format.full.names} +{'s := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { bbl.etal * } + { bbl.and space.connect t space.connect } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.full} +{ author empty$ + { editor empty$ + { "" } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.full} +{ author empty$ + { "" } + { author format.full.names } + if$ +} + +FUNCTION {editor.full} +{ editor empty$ + { "" } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.full + { type$ "proceedings" = + 'editor.full + 'author.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} { newline$ + "\bibitem[" write$ + label write$ + ")" make.full.names duplicate$ short.list = + { pop$ } + { * } + if$ + "]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} { 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { "" } + { year boldface } + if$ +} + +FUNCTION {format.bdate} +{ year empty$ + { "There's no year in " cite$ * warning$ } + { year boldface } + if$ +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "Can't use both " swap$ * " fields in " + * cite$ * warning$ } + 
if$ +} + +FUNCTION {format.edition} +{ edition duplicate$ empty$ + 'skip$ + { convert.edition + bbl.edition bibinfo.check + " " * bbl.edition * + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { bbl.pages pages n.dashify tie.or.space.connect } + { bbl.page pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.pages.required} +{ pages empty$ + { "" + "There are no page numbers for " cite$ * warning$ + output + } + { pages multi.page.check + { bbl.pages pages n.dashify tie.or.space.connect } + { bbl.page pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.pages.nopp} +{ pages empty$ + { "" + "There are no page numbers for " cite$ * warning$ + output + } + { pages multi.page.check + { pages n.dashify space.connect } + { pages space.connect } + if$ + } + if$ +} + +FUNCTION {format.pages.patent} +{ pages empty$ + { "There is no patent number for " cite$ * warning$ } + { pages multi.page.check + { pages n.dashify } + { pages } + if$ + } + if$ +} + +FUNCTION {format.vol.pages} +{ volume emphasize field.or.null + duplicate$ empty$ + { pop$ format.pages.required } + { add.comma pages n.dashify * } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { bbl.chapter } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { add.comma format.pages * } + if$ + } + if$ +} + +FUNCTION {format.title.in} +{ 's := + after.item 'output.state := + s empty$ + { "" } + { editor empty$ + { bbl.in s format.title space.connect } + { bbl.in s format.title space.connect + add.comma format.editors * + } + if$ + } + if$ +} + 
+FUNCTION {format.title.vol.in} +{ 's := + after.item 'output.state := + s empty$ + { "" } + { editor empty$ + { bbl.in s format.title.vol space.connect } + { bbl.in s format.title.vol space.connect + add.comma format.editors * + } + if$ + } + if$ +} + +FUNCTION {format.pub.address} +{ publisher empty$ + { "" } + { address empty$ + { publisher } + { publisher add.comma address *} + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + year empty$ note empty$ url empty$ + and and and and and + { "all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {empty.doi.note} +{ doi empty$ note empty$ and + { "Need either a note or DOI for " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type emphasize + } + if$ +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + is.use.title + { title format.title.noemph "title" output.check } + 'skip$ + if$ + + journal "," "" find.replace emphasize + "journal" output.check + after.item 'output.state := + format.date "year" output.check + volume empty$ + { "" format.pages.nopp output } + { format.vol.pages output } + if$ + format.doi + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { title format.title.vol "title" output.check + editor empty$ + { "Need either an author or editor for " + cite$ * warning$ } + { "" format.editors * "editor" output.check } + if$ + } + { format.authors output + "author and editor" editor either.or.check + title format.title.vol "title" output.check + } + if$ + format.pub.address "publisher" output.check + format.edition output + format.bdate "year" output.check + pages empty$ + 'skip$ + { format.pages output } + if$ + format.doi + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + title format.title "title" output.check + howpublished output + address output + format.date output + format.doi + 
fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { title format.title.vol "title" output.check + editor empty$ + { "Need at least an author or an editor for " + cite$ * warning$ } + { "" format.editors * "editor" output.check } + if$ + } + { format.authors output + title format.title.vol.in "title" output.check + } + if$ + format.pub.address "publisher" output.check + format.edition output + format.bdate "year" output.check + format.chapter.pages "chapter and pages" output.check + format.doi + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + author empty$ + { booktitle format.title.vol "booktitle" output.check + editor empty$ + { "Need at least an author or an editor for " + cite$ * warning$ } + { "" format.editors * "editor" output.check } + if$ + } + { format.authors output + booktitle format.title.vol.in "booktitle" output.check + } + if$ + format.pub.address "publisher" output.check + format.edition output + format.bdate "year" output.check + format.chapter.pages "chapter and pages" output.check + format.doi + fin.entry +} + +FUNCTION {inpress} +{ output.bibitem + format.authors "author" output.check + journal emphasize "journal" output.check + bbl.inpress output + format.doi + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + booktitle format.title "booktitle" output.check + address output + format.date "year" output.check + pages empty$ + 'skip$ + { format.pages output } + if$ + format.doi + fin.entry +} + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output + address output + } + if$ + } + { format.authors output } + if$ + title format.title.noemph "title" output.check + author empty$ + { organization empty$ + { address output } + 'skip$ + if$ + } + { organization output + address output + } + if$ + format.edition output + format.date output + format.doi + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + 
format.authors "author" output.check + bbl.msc format.thesis.type output + school "school" output.check + address output + format.date "year" output.check + format.doi + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + title empty$ + 'skip$ + { title format.title output } + if$ + howpublished output + format.date output + format.url output + note output + format.doi + fin.entry + empty.misc.check +} + +FUNCTION {patent} +{ output.bibitem + organization empty$ + { format.authors "author and organization" output.check } + { author empty$ + { organization } + { format.authors organization paren space.connect } + if$ + "author and organization" output.check + } + if$ + journal emphasize "journal" output.check + format.pages.patent "pages" output.check + format.date "year" output.check + format.doi + fin.entry +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + bbl.phd format.thesis.type output + school "school" output.check + address output + format.date "year" output.check + format.doi + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + title format.title "title" output.check + address output + format.date "year" output.check + pages empty$ + 'skip$ + { format.pages output } + if$ + format.doi + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + title format.title "title" output.check + institution + type empty$ + 'bbl.techreport + 'type + if$ + space.connect + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ + output + format.pub.address output + format.date "year" output.check + format.doi + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + journal empty$ + 'skip$ + { journal emphasize "journal" output.check } + if$ + doi empty$ + { note output } + { + bbl.doi doi tie.or.space.connect output + } + if$ + fin.entry + empty.doi.note +} + +INTEGERS { a b } + +FUNCTION {mult} +{ 'a 
:= + 'b := + b #0 < + {#-1 #0 b - 'b :=} + {#1} + if$ + #0 + {b #0 >} + { a + + b #1 - 'b := + } + while$ + swap$ + 'skip$ + {#0 swap$ -} + if$ +} + +FUNCTION {str.to.int.aux} +{ {duplicate$ empty$ not} + { swap$ #10 mult 'a := + duplicate$ #1 #1 substring$ + chr.to.value a + + swap$ + #2 global.max$ substring$ + } + while$ + pop$ +} + +FUNCTION {str.to.int} +{ duplicate$ #1 #1 substring$ "-" = + {#1 swap$ #2 global.max$ substring$} + {#0 swap$} + if$ + #0 swap$ str.to.int.aux + swap$ + {#0 swap$ -} + {} + if$ +} + +FUNCTION {yes.no.to.int} +{ "l" change.case$ duplicate$ + "yes" = + { pop$ #1 } + { duplicate$ "no" = + { pop$ #0 } + { "unknown Boolean " quote$ * swap$ * quote$ * + " in " * cite$ * warning$ + #0 + } + if$ + } + if$ +} + +FUNCTION {Control} +{ ctrl-use-title + empty$ + { skip$ } + { ctrl-use-title + yes.no.to.int + 'is.use.title := } + if$ + ctrl-etal-number + empty$ + { skip$ } + { ctrl-etal-number + str.to.int + 'etal.number := } + if$ + ctrl-use-doi-all + empty$ + { skip$ } + { ctrl-use-doi-all + yes.no.to.int + 'use.doi.all := } + if$ + ctrl-link-doi + empty$ + { skip$ } + { ctrl-link-doi + yes.no.to.int + 'link.doi := } + if$ +} + +FUNCTION {conference} {inproceedings} + +FUNCTION {other} {patent} + +FUNCTION {default.type} {misc} + +MACRO {jan} {"January"} +MACRO {feb} {"February"} +MACRO {mar} {"March"} +MACRO {apr} {"April"} +MACRO {may} {"May"} +MACRO {jun} {"June"} +MACRO {jul} {"July"} +MACRO {aug} {"August"} +MACRO {sep} {"September"} +MACRO {oct} {"October"} +MACRO {nov} {"November"} +MACRO {dec} {"December"} + +READ + +FUNCTION {initialize.controls} +{ default.is.use.title 'is.use.title := + default.etal.number 'etal.number := + default.use.doi.all 'use.doi.all := + default.link.doi 'link.doi := +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {format.lab.names} +{ 's := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 
> + { pop$ bbl.etal space.connect } + { #2 < + 'skip$ + { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { bbl.etal space.connect } + { bbl.and space.connect s #2 "{vv~}{ll}" + format.name$ space.connect } + if$ + } + if$ + } + if$ +} + +FUNCTION {author.key.label} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.editor.key.label} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.lab.names } + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {author.key.organization.label} +{ author empty$ + { key empty$ + { organization empty$ + { cite$ #1 #3 substring$ } + { "The " #4 organization chop.word #3 + text.prefix$ } + if$ + } + 'key + if$ + } + { author format.lab.names } + if$ +} + +FUNCTION {editor.key.organization.label} +{ editor empty$ + { key empty$ + { organization empty$ + { cite$ #1 #3 substring$ } + { "The " #4 organization chop.word #3 + text.prefix$ } + if$ + } + 'key + if$ + } + { editor format.lab.names } + if$ +} + +FUNCTION {calc.short.authors} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.label + { type$ "proceedings" = + 'editor.key.organization.label + { type$ "manual" = + 'author.key.organization.label + 'author.key.label + if$ + } + if$ + } + if$ + 'short.list := +} + +FUNCTION {calc.label} +{ calc.short.authors + short.list + "(" + * + year duplicate$ empty$ + short.list key field.or.null = or + { pop$ "" } + 'skip$ + if$ + * + 'label := +} + +ITERATE {calc.label} + +STRINGS { longest.label last.label next.extra } + +INTEGERS { longest.label.width last.extra.num number.label } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #0 int.to.chr$ 'last.label := + "" 'next.extra := + #0 'longest.label.width := + #0 'last.extra.num := + #0 'number.label := +} + +FUNCTION {forward.pass} +{ last.label label = + { last.extra.num #1 + 'last.extra.num := + 
last.extra.num int.to.chr$ 'extra.label := + } + { "a" chr.to.int$ 'last.extra.num := + "" 'extra.label := + label 'last.label := + } + if$ + number.label #1 + 'number.label := +} + +EXECUTE {initialize.longest.label} + +ITERATE {forward.pass} + +FUNCTION {begin.bib} +{ + "rsc 2016/08/22 v3.1f" top$ + preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\providecommand*{\mcitethebibliography}{\thebibliography}" + write$ newline$ + "\csname @ifundefined\endcsname{endmcitethebibliography}" + write$ newline$ + "{\let\endmcitethebibliography\endthebibliography}{}" + write$ newline$ + "\begin{mcitethebibliography}{" number.label int.to.str$ * "}" * + write$ newline$ + "\providecommand*{\natexlab}[1]{#1}" + write$ newline$ + "\providecommand*{\mciteSetBstSublistMode}[1]{}" + write$ newline$ + "\providecommand*{\mciteSetBstMaxWidthForm}[2]{}" + write$ newline$ + "\providecommand*{\mciteBstWouldAddEndPuncttrue}" + write$ newline$ + " {\def\EndOfBibitem{\unskip.}}" + write$ newline$ + "\providecommand*{\mciteBstWouldAddEndPunctfalse}" + write$ newline$ + " {\let\EndOfBibitem\relax}" + write$ newline$ + "\providecommand*{\mciteSetBstMidEndSepPunct}[3]{}" + write$ newline$ + "\providecommand*{\mciteSetBstSublistLabelBeginEnd}[3]{}" + write$ newline$ + "\providecommand*{\EndOfBibitem}{}" + write$ newline$ + "\mciteSetBstSublistMode{f}" + write$ newline$ + "\mciteSetBstMaxWidthForm{subitem}" + write$ newline$ + "{\alph{mcitesubitemcount})}" + write$ newline$ + "\mciteSetBstSublistLabelBeginEnd{\mcitemaxwidthsubitemform\space}" + write$ newline$ + "{\relax}{\relax}" + write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +EXECUTE {initialize.controls} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{mcitethebibliography}" write$ newline$ +} + +EXECUTE {end.bib} +%% +%% Copyright (C) 2006-2009,2011,2013,2016 by +%% Joseph Wright +%% +%% It may be distributed and/or modified under the conditions of +%% the LaTeX Project Public 
License (LPPL), either version 1.3c of
+%% this license or (at your option) any later version. The latest
+%% version of this license is in the file:
+%%
+%% http://www.latex-project.org/lppl.txt
+%%
+%% This work is "maintained" (as per LPPL maintenance status) by
+%% Joseph Wright.
+%%
+%% This work consists of the file rsc.dtx
+%% and the derived files rsc.pdf,
+%% rsc.ins,
+%% rsc.sty,
+%% rsc.bib and
+%% rsc-demo.tex.
+%%
+%%
+%%
+%% End of file `WileyNJD-AMA.bst'.
diff --git a/paper/references.bib b/paper/references.bib
index c8c51be..79fe23c 100644
--- a/paper/references.bib
+++ b/paper/references.bib
@@ -1,7 +1,18 @@
+@article{Deutsch2026,
+ title = {The {ProteomeXchange} consortium in 2026: making proteomics data {FAIR}},
+ author = {Deutsch, Eric W. and Bandeira, Nuno and Perez-Riverol, Yasset and Sharma, Vagisha and Carver, Jeremy J. and Mendoza, Luis and Kundu, Deepti J. and Bandla, Chakradhar and Kamatchinathan, Selvakumar and Hewapathirana, Suresh and Sun, Zhi and Kawano, Shin and Okuda, Shujiro and Connolly, Brian and MacLean, Brendan and MacCoss, Michael J. 
and Chen, Tao and Zhu, Yunping and Ishihama, Yasushi and Vizca{\'\i}no, Juan Antonio}, + journal = {Nucleic Acids Res}, + volume = {54}, + number = {D1}, + pages = {D459--D469}, + year = {2026}, + doi = {10.1093/nar/gkaf1146} +} + @article{PerezRiverol2022reanalysis, title = {Proteomic repository data submission, dissemination, and reuse: key messages}, author = {Perez-Riverol, Yasset}, - journal = {Expert Review of Proteomics}, + journal = {Expert Rev Proteomics}, volume = {19}, number = {7-12}, pages = {297--310}, @@ -12,7 +23,7 @@ @article{PerezRiverol2022reanalysis @article{DiTommaso2017, title = {Nextflow enables reproducible computational workflows}, author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Prieto Barja, Pablo and Palumbo, Emilio and Notredame, Cedric}, - journal = {Nature Biotechnology}, + journal = {Nat Biotechnol}, volume = {35}, number = {4}, pages = {316--319}, @@ -23,7 +34,7 @@ @article{DiTommaso2017 @article{Dai2021, title = {A proteomics sample metadata representation for multiomics integration and big data analysis}, author = {Dai, Chengxin and F{\"u}llgrabe, Anja and Pfeuffer, Julianus and Solovyeva, Elizaveta M and Deng, Jingwen and Moreno, Pablo and Kamatchinathan, Selvakumar and Kundu, Deepti Jaiswal and George, Nancy and Fexova, Silvie and others}, - journal = {Nature Communications}, + journal = {Nat Commun}, volume = {12}, number = {1}, pages = {5854}, @@ -34,7 +45,7 @@ @article{Dai2021 @article{PerezRiverol2019, title = {Quantifying the impact of public omics data}, author = {Perez-Riverol, Yasset and Zorin, Andrey and Dass, Gaurhari and Vu, Manh-Tu and Xu, Rui and Hermjakob, Henning and Vizcaíno, Juan Antonio}, - journal = {Nature Communications}, + journal = {Nat Commun}, volume = {10}, number = {1}, pages = {3512}, @@ -46,7 +57,7 @@ @article{PerezRiverol2019 @article{PerezRiverol2025, title = {The {PRIDE} database at 20 years: 2025 update}, author = {Perez-Riverol, Yasset and Bandla, Chakradhar and Kundu, 
Deepti J and Kamatchinathan, Selvakumar and Bai, Jingwen and Hewapathirana, Suresh and John, Nithu Sara and Riera Duocastella, Marc and Vibranovski, Maria D and Hermjakob, Henning and Vizcaíno, Juan Antonio}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {53}, number = {D1}, pages = {D543--D553}, @@ -58,7 +69,7 @@ @article{PerezRiverol2025 @article{Kamatchinathan2025, title = {pridepy: A {P}ython Package to Download and Search Data from {PRIDE} Database}, author = {Kamatchinathan, Selvakumar and Hewapathirana, Suresh and Bandla, Chakradhar and Perez-Riverol, Yasset}, - journal = {Journal of Open Source Software}, + journal = {J Open Source Softw}, volume = {10}, number = {107}, pages = {7563}, @@ -69,7 +80,7 @@ @article{Kamatchinathan2025 @article{Dai2024, title = {quantms: a cloud-based pipeline for quantitative proteomics enables the reanalysis of public proteomics data}, author = {Dai, Chengxin and Pfeuffer, Julianus and Wang, Hong and Zheng, Ping and Käll, Lukas and Sachsenberg, Timo and Demichev, Vadim and Bai, Mingze and Kohlbacher, Oliver and Perez-Riverol, Yasset}, - journal = {Nature Methods}, + journal = {Nat Methods}, volume = {21}, number = {9}, pages = {1603--1607}, @@ -93,7 +104,7 @@ @article{Perez2019 @article{Perez-Riverol2022, title = {The {PRIDE} database resources in 2022: a hub for mass spectrometry-based proteomics evidences}, author = {Perez-Riverol, Yasset and Bai, Jingwen and Bandla, Chakradhar and Garc{\'\i}a-Seisdedos, David and Hewapathirana, Suresh and Kamatchinathan, Selvakumar and Kundu, Deepti J and Prakash, Ananth and Frericks-Zipper, Anika and Eisenacher, Martin and others}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {50}, number = {D1}, pages = {D483--D490}, @@ -105,7 +116,7 @@ @article{Perez-Riverol2022 @article{Leinonen2011, title = {The {European Nucleotide Archive}}, author = {Leinonen, Rasko and Akhtar, Ruth and Birney, Ewan and Bower, Lawrence and 
Cerdeno-T{\'a}rraga, Ana and Cheng, Yuan and Cleland, Iain and Faruque, Nadeem and Goodgame, Neil and Gibson, Richard and others}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {39}, number = {suppl\_1}, pages = {D28--D31}, @@ -117,7 +128,7 @@ @article{Leinonen2011 @article{UniProtConsortium2023, title = {{UniProt}: the universal protein knowledgebase in 2023}, author = {{The UniProt Consortium}}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {51}, number = {D1}, pages = {D483--D489}, @@ -130,6 +141,7 @@ @techreport{Imperva2023 title = {Bad Bot Report 2023: The Account Takeover Edition}, author = {{Imperva}}, institution = {Imperva Inc.}, + address = {San Mateo, CA}, year = {2023}, url = {https://www.imperva.com/resources/reports/2023-bad-bot-report/}, note = {Annual analysis of automated bot traffic patterns across the internet} @@ -138,7 +150,7 @@ @techreport{Imperva2023 @article{Jonker2019, title = {Fingerprinting tooling used for {SSH} dictionary attack}, author = {Jonker, Mattijs and Stone-Gross, Brett and Plonka, David and Boehme, Alistair}, - journal = {Digital Investigation}, + journal = {Digit Investig}, volume = {31}, pages = {S138--S146}, year = {2019}, @@ -176,7 +188,7 @@ @inproceedings{Cabri2021 @article{Habibi2020, title = {Bot detection using {U}ser {A}gent-based fingerprinting}, author = {Habibi Lashkari, Arash and Kadir, Andi Fitriah Abdul and Gonzalez, Hugo and Mbah, Kenneth F and Ghorbani, Ali A}, - journal = {Computers \& Security}, + journal = {Comput Secur}, volume = {95}, pages = {101869}, year = {2020}, @@ -197,7 +209,7 @@ @inproceedings{Liu2008 @article{Breunig2000, title = {{LOF}: identifying density-based local outliers}, author = {Breunig, Markus M and Kriegel, Hans-Peter and Ng, Raymond T and Sander, J{\"o}rg}, - journal = {ACM SIGMOD Record}, + journal = {ACM SIGMOD Rec}, volume = {29}, number = {2}, pages = {93--104}, @@ -209,7 +221,7 @@ @article{Breunig2000 
@article{Scholkopf2001, title = {Estimating the support of a high-dimensional distribution}, author = {Sch{\"o}lkopf, Bernhard and Platt, John C and Shawe-Taylor, John and Smola, Alex J and Williamson, Robert C}, - journal = {Neural Computation}, + journal = {Neural Comput}, volume = {13}, number = {7}, pages = {1443--1471}, @@ -229,7 +241,7 @@ @article{Vaswani2017 @article{Chandola2009, title = {Anomaly detection: A survey}, author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin}, - journal = {ACM Computing Surveys}, + journal = {ACM Comput Surv}, volume = {41}, number = {3}, pages = {1--58}, @@ -249,7 +261,7 @@ @inproceedings{Devlin2019 @article{Pedregosa2011, title = {Scikit-learn: Machine learning in {P}ython}, author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others}, - journal = {Journal of Machine Learning Research}, + journal = {J Mach Learn Res}, volume = {12}, pages = {2825--2830}, year = {2011} @@ -302,7 +314,7 @@ @inproceedings{Lundberg2017 @article{Wilkinson2016, title = {The {FAIR} Guiding Principles for scientific data management and stewardship}, author = {Wilkinson, Mark D and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E and others}, - journal = {Scientific Data}, + journal = {Sci Data}, volume = {3}, number = {1}, pages = {160018}, @@ -326,7 +338,7 @@ @article{Orr2025 @article{Demichev2020, author = {Demichev, Vadim and Messner, Christoph B. and Vernardis, Spyros I. and Lilley, Kathryn S. 
and Ralser, Markus}, title = {{DIA-NN}: neural networks and interference correction enable deep proteome coverage in high throughput}, - journal = {Nature Methods}, + journal = {Nat Methods}, year = {2020}, volume = {17}, number = {1}, @@ -337,7 +349,7 @@ @article{Demichev2020 @article{Cox2008, author = {Cox, J{\"u}rgen and Mann, Matthias}, title = {{MaxQuant} enables high peptide identification rates, individualized p.p.b.-range mass accuracies and proteome-wide protein quantification}, - journal = {Nature Biotechnology}, + journal = {Nat Biotechnol}, year = {2008}, volume = {26}, number = {12}, @@ -348,7 +360,7 @@ @article{Cox2008 @article{Kong2017, author = {Kong, Andy T. and Leprevost, Felipe V. and Avtonomov, Dmitry M. and Mellacheruvu, Dattatreya and Nesvizhskii, Alexey I.}, title = {{MSFragger}: ultrafast and comprehensive peptide identification in mass spectrometry-based proteomics}, - journal = {Nature Methods}, + journal = {Nat Methods}, year = {2017}, volume = {14}, pages = {513--520}, @@ -368,7 +380,7 @@ @inproceedings{Raasveldt2019 @article{Desiere2006, title = {The {PeptideAtlas} project}, author = {Desiere, Frank and Deutsch, Eric W and King, Nichole L and Nesvizhskii, Alexey I and Mallick, Parag and Eng, Jimmy and Chen, Sharon and Eddes, James and Loevenich, Sandra N and Aebersold, Ruedi}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {34}, number = {suppl\_1}, pages = {D655--D658}, @@ -380,7 +392,7 @@ @article{Desiere2006 @article{Craig2004, title = {Open source system for analyzing, validating, and storing protein identification data}, author = {Craig, Robertson and Cortens, John P and Beavis, Ronald C}, - journal = {Journal of Proteome Research}, + journal = {J Proteome Res}, volume = {3}, number = {6}, pages = {1234--1242}, @@ -392,7 +404,7 @@ @article{Craig2004 @article{Decoster2022, title = {{Scop3P}: a comprehensive resource of human phosphosites within their full context}, author = {Decoster, Pathmanaban and 
Nkuipou-Kenfack, Eliane and Van Den Bossche, Tim and Menschaert, Gerben and Martens, Lennart and Gevaert, Kris and Coornaert, Bert and Versele, Mathieu and Ndah, Elvis and Costanzo, Michael C and others}, - journal = {Journal of Proteome Research}, + journal = {J Proteome Res}, volume = {22}, number = {1}, pages = {106--118}, @@ -404,7 +416,7 @@ @article{Decoster2022 @article{Shao2020, title = {{MatrisomeDB} 2.0: 2023 updates to the {ECM}-protein knowledge database}, author = {Shao, Xinhao and Gomez, Clarissa D and Kapoor, Nandini and Considine, James M and Grams, Christopher and Gao, Yu (Tom) and Naba, Alexandra}, - journal = {Nucleic Acids Research}, + journal = {Nucleic Acids Res}, volume = {51}, number = {D1}, pages = {D1519--D1530}, @@ -421,3 +433,11 @@ @article{Dai2024pmultiqc doi = {10.1101/2025.11.02.685980}, publisher = {Cold Spring Harbor Laboratory} } + +@misc{WellcomeLMIC, + title = {Low- and middle-income countries}, + author = {{Wellcome Trust}}, + year = {2025}, + howpublished = {\url{https://wellcome.org/research-funding/guidance/prepare-to-apply/low-and-middle-income-countries}}, + note = {Based on OECD DAC list. 
Accessed February 2026} +} diff --git a/paper/supplementary.pdf b/paper/supplementary.pdf index 877ff15..198ea22 100644 Binary files a/paper/supplementary.pdf and b/paper/supplementary.pdf differ diff --git a/paper/supplementary.tex b/paper/supplementary.tex index 83a3012..626bb07 100644 --- a/paper/supplementary.tex +++ b/paper/supplementary.tex @@ -31,12 +31,12 @@ } \title{\textbf{Supplementary Notes}\\[0.5em] -\large Tracking Dataset Reuse in Proteomics: A Comprehensive Analysis of PRIDE Archive Downloads} +\large Tracking dataset reuse in proteomics: a comprehensive analysis of PRIDE data download statistics} \author{ \small Suresh Hewapathirana, Jingwen Bai, Chakradhar Bandla, Selvakumar Kamatchinathan,\\ \small Deepti J Kundu, Nithu Sara John, Boma Brown-Harry, Nandana Madhusoodanan,\\ -\small Marc Riera Duocastella, Juan Antonio Vizca\'{i}no, Yasset Perez-Riverol +\small Joan Marc Riera Duocastella, Juan Antonio Vizca\'{i}no, Yasset Perez-Riverol } \date{} @@ -64,12 +64,12 @@ \subsection{Log Processing Workflow} \subsection{Data Scale and Coverage} -The processed dataset covers the period January 2020 through January 2025 (Figure~\ref{fig:pride_overview}). Key metrics include 47.35 million total file downloads across 32,106 distinct projects, 2.26 million unique files, and 807,156 unique users. The analyzed projects represent 96.4\% of all public PRIDE datasets, and 88.0\% of PRIDE files have been downloaded at least once. Downloads originate from 136 countries with more than 100 downloads each. +The processed Parquet file covers the period January 2021 through December 2025 (Figure~\ref{fig:pride_overview}). Key metrics include 159.3 million total file downloads across 35,528 distinct projects, 2.98 million unique files, and 9.80 million unique users. The analyzed projects represent essentially all public PRIDE datasets, and 91.7\% of PRIDE files have been downloaded at least once. Downloads originate from 194 countries with more than 100 downloads each. 
\begin{figure}[H] \centering \includegraphics[width=\textwidth]{figures/supp_pride_overview.png} -\caption{Overview of PRIDE download activity (2020--2025). Overall scale metrics, reuse intensity across projects/files/users, file coverage, and geographic reach.} +\caption{Overview of PRIDE download activity (2021--2025). Overall scale metrics, reuse intensity across projects/files/users, file coverage, and geographic reach.} \label{fig:pride_overview} \end{figure} @@ -375,7 +375,7 @@ \subsection{Inter-Method Agreement} \subsection{Classification Outcome Comparison} -On the 1M-record benchmark sample, the two methods produce different classification distributions (Figure~\ref{fig:method_comparison}). The Rules method classifies 29\% of locations as bots (72\% of downloads), while Deep classifies 34\% (77\% of downloads). +On the 1M-record benchmark sample, the two methods produce different classification distributions (Figure~\ref{fig:method_comparison}). The Rules method classifies 29\% of locations as bots (72\% of downloads), while Deep classifies 35\% (77\% of downloads). \begin{figure}[H] \centering @@ -451,7 +451,7 @@ \subsection{Regional Distribution} \begin{figure}[H] \centering \includegraphics[width=0.65\textwidth]{figures/figure1b_regional_distribution.pdf} -\caption{PRIDE downloads by world region (after bot removal, 2020--2025).} +\caption{PRIDE downloads by world region (after bot removal, 2021--2025).} \label{fig:regional_supp} \end{figure} @@ -492,7 +492,7 @@ \subsection{Top Downloaded Datasets} \subsection{Dataset Download Consistency} -The consistency heatmap (Figure~7B in the main text) shows that top datasets maintain sustained download activity across multiple years rather than one-time spikes. Beyond this, we rank datasets by a consistency score combining low coefficient of variation with high activity ratio (Figure~\ref{fig:consistency_scores}). 
PXD013868 achieves the highest consistency score (0.788), indicating steady, reliable reuse across the study period. +The consistency heatmap (Figure~5B in the main text) shows that top datasets maintain sustained download activity across multiple years rather than one-time spikes. Beyond this, we rank datasets by a consistency score combining low coefficient of variation with high activity ratio (Figure~\ref{fig:consistency_scores}). PXD013868 achieves the highest consistency score (0.788), indicating steady, reliable reuse across the study period. \begin{figure}[H] \centering @@ -523,17 +523,6 @@ \subsection{Country-Level Usage Intensity} \label{fig:bubble_chart} \end{figure} -\subsection{ProteomeXchange Resources} - -PRIDE hosts 83.2\% of all ProteomeXchange datasets, followed by MassIVE (6.9\%) and iProX (5.5\%) (Figure~\ref{fig:px_resources}). This dominance reflects PRIDE's position as the primary public repository for mass spectrometry proteomics data. - -\begin{figure}[H] -\centering -\includegraphics[width=0.5\textwidth]{figures/supp_px_resources.png} -\caption{Distribution of datasets across ProteomeXchange partner resources.} -\label{fig:px_resources} -\end{figure} - % ====================================================================== \section{S9. Limitations} \label{sec:limitations} @@ -541,7 +530,7 @@ \section{S9. Limitations} Our ground truth labels are heuristic-derived rather than manually verified, which may introduce systematic biases in the benchmark evaluation. The geographic attribution relies on IP geolocation, which can be inaccurate for users behind VPNs or institutional proxies. The 2025 data is from a partial year, making year-over-year comparisons with full years approximate. Finally, we cannot distinguish multiple individual users who share a geographic location from a single user, which may affect location-level statistics. 
-\bibliographystyle{unsrtnat} +\bibliographystyle{proteomics} \bibliography{references} \end{document} diff --git a/scripts/generate_figures.py b/scripts/generate_figures.py index 25e3b09..499f9ce 100644 --- a/scripts/generate_figures.py +++ b/scripts/generate_figures.py @@ -27,15 +27,15 @@ # Style settings for publication plt.rcParams.update({ - 'font.size': 10, + 'font.size': 12, 'font.family': 'sans-serif', - 'axes.labelsize': 11, - 'axes.titlesize': 12, - 'xtick.labelsize': 9, - 'ytick.labelsize': 9, - 'legend.fontsize': 9, - 'figure.dpi': 300, - 'savefig.dpi': 300, + 'axes.labelsize': 13, + 'axes.titlesize': 13, + 'xtick.labelsize': 11, + 'ytick.labelsize': 11, + 'legend.fontsize': 10, + 'figure.dpi': 600, + 'savefig.dpi': 600, 'savefig.bbox': 'tight', 'savefig.pad_inches': 0.1, }) @@ -53,13 +53,16 @@ 'Austria', 'Poland', 'Ireland', ] -# World Bank low/middle income countries present in PRIDE data -# China excluded here (upper-middle income, already dominant in panel A) +# Wellcome Trust / OECD DAC low- and middle-income countries present in PRIDE data +# (https://wellcome.org/research-funding/guidance/prepare-to-apply/low-and-middle-income-countries) +# China excluded here (already dominant in other panels) LMIC_COUNTRIES = [ - 'India', 'Brazil', 'Mexico', 'Indonesia', 'Thailand', - 'Colombia', 'Argentina', 'South Africa', 'Vietnam', 'Bangladesh', - 'Pakistan', 'Peru', 'Chile', 'Philippines', 'Nigeria', 'Egypt', - 'Kenya', 'Iran', 'Malaysia', 'Morocco', 'Turkey', 'Ukraine', + 'India', 'Russia', 'Mexico', 'Brazil', 'Turkey', 'Argentina', + 'Philippines', 'South Africa', 'Ukraine', 'Panama', 'Malaysia', + 'Algeria', 'Thailand', 'Indonesia', 'Bulgaria', 'Colombia', + 'Pakistan', 'Romania', 'Serbia', 'Cuba', 'Sri Lanka', 'Morocco', + 'Egypt', 'Vietnam', 'Bangladesh', 'Peru', 'Tunisia', 'Kenya', + 'Iran', 'Nigeria', ] # Color palette @@ -98,7 +101,7 @@ def figure_bot_detection_overview(output_dir): ax.set_title('(A) PRIDE Logs Workflow', fontsize=12, 
fontweight='bold', pad=10) # Style definitions - def draw_box(ax, x, y, w, h, text, color='#EBF5FB', edge='#2980B9', fontsize=8, bold=False): + def draw_box(ax, x, y, w, h, text, color='#EBF5FB', edge='#2980B9', fontsize=9, bold=False): box = FancyBboxPatch((x, y), w, h, boxstyle='round,pad=0.15', facecolor=color, edgecolor=edge, linewidth=1.5) ax.add_patch(box) @@ -118,28 +121,28 @@ def draw_arrow(ax, x1, y1, x2, y2, color='#2C3E50'): facecolor='none', edgecolor='#27AE60', linewidth=2.0, linestyle='--') ax.add_patch(nf_rect) - ax.text(0.3, 9.65, 'nf-downloadstats', fontsize=9, fontweight='bold', color='#27AE60', + ax.text(0.3, 9.65, 'nf-downloadstats', fontsize=10, fontweight='bold', color='#27AE60', fontstyle='italic') # Row 1: Data collection - draw_box(ax, 0.2, 8.3, 2.2, 1.0, 'PRIDE\nLog Files\n(TSV)', color='#FDEBD0', edge='#E67E22', fontsize=8, bold=True) + draw_box(ax, 0.2, 8.3, 2.2, 1.0, 'PRIDE\nLog Files\n(TSV)', color='#FDEBD0', edge='#E67E22', fontsize=9, bold=True) draw_arrow(ax, 2.4, 8.8, 3.0, 8.8) - draw_box(ax, 3.0, 8.3, 3.0, 1.0, 'Parse, Filter\n& Merge', color='#D5F5E3', edge='#27AE60', fontsize=8) + draw_box(ax, 3.0, 8.3, 3.0, 1.0, 'Parse, Filter\n& Merge', color='#D5F5E3', edge='#27AE60', fontsize=9) draw_arrow(ax, 6.0, 8.8, 6.6, 8.8) - draw_box(ax, 6.6, 8.3, 2.6, 1.0, 'Parquet\n159M records\n(4.7 GB)', color='#FDEBD0', edge='#E67E22', fontsize=8, bold=True) + draw_box(ax, 6.6, 8.3, 2.6, 1.0, 'Parquet\n159M records\n(4.7 GB)', color='#FDEBD0', edge='#E67E22', fontsize=9, bold=True) # --- Component 2: DeepLogBot (bottom section) --- lg_rect = mpatches.FancyBboxPatch((0.05, -0.15), 9.4, 7.55, boxstyle='round,pad=0.15', facecolor='none', edgecolor='#2980B9', linewidth=2.0, linestyle='--') ax.add_patch(lg_rect) - ax.text(0.3, 7.2, 'DeepLogBot', fontsize=9, fontweight='bold', color='#2980B9', + ax.text(0.3, 7.2, 'DeepLogBot', fontsize=10, fontweight='bold', color='#2980B9', fontstyle='italic') # Row 2: Location aggregation + Feature extraction - 
draw_box(ax, 0.3, 5.9, 4.2, 1.0, 'Location Aggregation\n47,987 geographic locations', color='#EBF5FB', edge='#2980B9', fontsize=8) + draw_box(ax, 0.3, 5.9, 4.2, 1.0, 'Location Aggregation\n47,987 geographic locations', color='#EBF5FB', edge='#2980B9', fontsize=9) draw_arrow(ax, 4.5, 6.4, 5.0, 6.4) - draw_box(ax, 5.0, 5.9, 4.2, 1.0, 'Feature Extraction\n60+ behavioral features\n(activity, temporal, discriminative)', color='#EBF5FB', edge='#2980B9', fontsize=7.5) + draw_box(ax, 5.0, 5.9, 4.2, 1.0, 'Feature Extraction\n60+ behavioral features\n(activity, temporal, discriminative)', color='#EBF5FB', edge='#2980B9', fontsize=8.5) # Arrows inside DeepLogBot from top to both boxes (no arrow from Parquet) midx = 4.75 # midpoint between the two boxes @@ -152,14 +155,14 @@ def draw_arrow(ax, x1, y1, x2, y2, color='#2C3E50'): draw_arrow(ax, 7.1, 5.9, 4.75, 5.4) # Row 3: Anomaly detection - draw_box(ax, 2.5, 4.3, 4.5, 0.9, 'Anomaly Detection\nIsolation Forest (contamination=15%)', color='#F5EEF8', edge='#8E44AD', fontsize=8) + draw_box(ax, 2.5, 4.3, 4.5, 0.9, 'Anomaly Detection\nIsolation Forest (contamination=15%)', color='#F5EEF8', edge='#8E44AD', fontsize=9) # Arrow down to methods draw_arrow(ax, 4.75, 4.3, 4.75, 3.8) # Row 4: Two classification methods side by side - draw_box(ax, 1.0, 2.7, 3.0, 0.9, 'Rule-Based\nYAML thresholds\n3-level hierarchy', color='#FADBD8', edge='#E74C3C', fontsize=7.5, bold=False) - draw_box(ax, 5.5, 2.7, 3.0, 0.9, 'Deep Architecture\n40+ extra features\n2-stage pipeline', color='#FADBD8', edge='#E74C3C', fontsize=7.5, bold=False) + draw_box(ax, 1.0, 2.7, 3.0, 0.9, 'Rule-Based\nYAML thresholds\n3-level hierarchy', color='#FADBD8', edge='#E74C3C', fontsize=8.5, bold=False) + draw_box(ax, 5.5, 2.7, 3.0, 0.9, 'Deep Architecture\n40+ extra features\n2-stage pipeline', color='#FADBD8', edge='#E74C3C', fontsize=8.5, bold=False) draw_arrow(ax, 4.75, 3.8, 2.5, 3.6) draw_arrow(ax, 4.75, 3.8, 7.0, 3.6) @@ -169,12 +172,12 @@ def draw_arrow(ax, x1, y1, x2, 
y2, color='#2C3E50'): draw_arrow(ax, 7.0, 2.7, 4.75, 2.2) # Row 5: Hierarchical classification output - draw_box(ax, 2.5, 1.1, 4.5, 0.9, 'Hierarchical Classification\nL1: Organic vs Automated\nL2: Bot vs Hub\nL3: Subcategory', color='#D4EFDF', edge='#27AE60', fontsize=7.5) + draw_box(ax, 2.5, 1.1, 4.5, 0.9, 'Hierarchical Classification\nL1: Organic vs Automated\nL2: Bot vs Hub\nL3: Subcategory', color='#D4EFDF', edge='#27AE60', fontsize=8.5) # Arrow to final output draw_arrow(ax, 4.75, 1.1, 4.75, 0.7) draw_box(ax, 1.5, 0.0, 6.5, 0.6, 'Bot-filtered dataset: 35.4M downloads, 34,908 datasets, 208 countries', - color='#D5F5E3', edge='#27AE60', fontsize=7.5, bold=True) + color='#D5F5E3', edge='#27AE60', fontsize=8.5, bold=True) # ---- Panel B: Classification distribution (download share bar chart) ---- ax2 = fig.add_subplot(gs[0, 1]) @@ -199,7 +202,7 @@ def draw_arrow(ax, x1, y1, x2, y2, color='#2C3E50'): pct = count / total_locs * 100 y_pos = max(bar.get_height() / 2, 5) ax2.text(bar.get_x() + bar.get_width() / 2, y_pos, - f'n={count:,}\n({pct:.1f}% locs)', ha='center', fontsize=7.5, + f'n={count:,}\n({pct:.1f}% locs)', ha='center', fontsize=9, color='white', fontweight='bold') plt.savefig(output_dir / 'figure_bot_detection_overview.pdf', format='pdf', bbox_inches='tight') @@ -286,26 +289,34 @@ def figure_2_temporal(output_dir): fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) # Panel A: Total downloads per year - ax1.bar(df['year'], df['total_downloads'] / 1e6, color='#3498DB', edgecolor='white', width=0.7) + years = df['year'].astype(int) + ax1.bar(years, df['total_downloads'] / 1e6, color='#3498DB', edgecolor='white', width=0.7) ax1.set_xlabel('Year') ax1.set_ylabel('Total Downloads (millions)') ax1.set_title('A) Annual Download Volume') ax1.spines['top'].set_visible(False) ax1.spines['right'].set_visible(False) + ax1.set_xticks(years) + ax1.set_xticklabels(years) + max_dl = df['total_downloads'].max() / 1e6 + ax1.set_ylim(0, max_dl * 1.25) for _, row in 
df.iterrows(): - ax1.text(row['year'], row['total_downloads'] / 1e6 + 0.5, - f"{row['total_downloads']/1e6:.1f}M", ha='center', fontsize=8) + ax1.text(int(row['year']), row['total_downloads'] / 1e6 + max_dl * 0.03, + f"{row['total_downloads']/1e6:.1f}M", ha='center', fontsize=11) # Panel B: Unique datasets and locations + years = df['year'].astype(int) ax2b = ax2.twinx() - l1 = ax2.plot(df['year'], df['unique_datasets'] / 1e3, 'o-', color='#E67E22', label='Unique datasets (k)') - l2 = ax2b.plot(df['year'], df['unique_locations'] / 1e3, 's--', color='#9B59B6', label='Unique locations (k)') + l1 = ax2.plot(years, df['unique_datasets'] / 1e3, 'o-', color='#E67E22', label='Unique datasets (k)') + l2 = ax2b.plot(years, df['unique_locations'] / 1e3, 's--', color='#9B59B6', label='Unique locations (k)') ax2.set_xlabel('Year') ax2.set_ylabel('Unique Datasets (thousands)', color='#E67E22') ax2b.set_ylabel('Unique Locations (thousands)', color='#9B59B6') ax2.set_title('B) Dataset and Location Growth') ax2.spines['top'].set_visible(False) + ax2.set_xticks(years) + ax2.set_xticklabels(years) lines = l1 + l2 labels = [l.get_label() for l in lines] @@ -556,14 +567,14 @@ def figure_5_concentration(output_dir): if top1_idx > 0: ax1.axvline(x=top1_idx, color='red', linestyle='--', alpha=0.7, linewidth=1) ax1.text(top1_idx * 1.3, downloads[0] * 0.5, 'Top 1%', - color='red', fontsize=9, fontweight='bold') + color='red', fontsize=11, fontweight='bold') # Annotate key statistics textstr = (f'Gini = {stats["gini_coefficient"]:.2f}\n' f'Top 1%: {stats["top_1pct_downloads_pct"]:.1f}% of DL\n' f'Top 10%: {stats["top_10pct_downloads_pct"]:.1f}% of DL\n' f'Median: {stats["median_downloads"]:,} DL') - ax1.text(0.95, 0.95, textstr, transform=ax1.transAxes, fontsize=9, + ax1.text(0.95, 0.95, textstr, transform=ax1.transAxes, fontsize=11, verticalalignment='top', horizontalalignment='right', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) @@ -574,15 +585,15 @@ def 
figure_5_concentration(output_dir): im = ax2.imshow(data, cmap='YlOrRd', aspect='auto', interpolation='nearest') ax2.set_xticks(range(len(heatmap_data.columns))) - ax2.set_xticklabels(heatmap_data.columns.astype(int), fontsize=9) + ax2.set_xticklabels(heatmap_data.columns.astype(int), fontsize=11) ax2.set_yticks(range(len(heatmap_data.index))) ylabels = [f'{acc} ({active_years[acc]}/{len(heatmap_data.columns)} yrs)' for acc in heatmap_data.index] - ax2.set_yticklabels(ylabels, fontsize=7) + ax2.set_yticklabels(ylabels, fontsize=8.5) ax2.set_xlabel('Year') cbar = plt.colorbar(im, ax=ax2, fraction=0.03, pad=0.04) - cbar.set_label('Downloads (log$_{10}$ scale)', fontsize=9) + cbar.set_label('Downloads (log$_{10}$ scale)', fontsize=11) # Annotate cells for i in range(data.shape[0]): @@ -597,7 +608,7 @@ def figure_5_concentration(output_dir): txt = str(val) color = 'white' if data[i, j] > 3.5 else 'black' ax2.text(j, i, txt, ha='center', va='center', - fontsize=5.5, color=color, fontweight='bold') + fontsize=7, color=color, fontweight='bold') ax2.set_title('(B) Top 25 Datasets: Download Consistency', fontsize=11, fontweight='bold', loc='left', pad=15) else: @@ -642,19 +653,19 @@ def figure_5_concentration(output_dir): x_fit = np.linspace(log_x.min(), log_x.max(), 100) ax3.plot(10**x_fit, 10**(slope * x_fit + intercept), 'r--', alpha=0.6, linewidth=1.5, zorder=4) ax3.text(0.05, 0.95, f'Spearman $\\rho$ = {rho:.3f}\np = {p_val:.2e}\nn = {len(df_cited)}', - transform=ax3.transAxes, fontsize=9, verticalalignment='top', + transform=ax3.transAxes, fontsize=11, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8)) for _, row in df_cited.nlargest(5, 'total_downloads').iterrows(): ax3.annotate(row['accession'], (row['citation_count'], row['total_downloads']), - fontsize=6.5, alpha=0.8, xytext=(5, 5), textcoords='offset points') + fontsize=8.5, alpha=0.8, xytext=(5, 5), textcoords='offset points') ax3.set_xscale('log') ax3.set_yscale('log') 
ax3.set_xlabel('EuropePMC Reuse Citation Count') ax3.set_ylabel('Total Downloads (bot-filtered)') cb = plt.colorbar(sc, ax=ax3, shrink=0.8, pad=0.02) - cb.set_label('Download Consistency\n(years active / 5)', fontsize=9) + cb.set_label('Download Consistency\n(years active / 5)', fontsize=11) ax3.spines['top'].set_visible(False) ax3.spines['right'].set_visible(False) ax3.grid(True, alpha=0.3, which='both') @@ -873,7 +884,7 @@ def figure_7_bubble_chart(output_dir): for lx, ly in labeled: if abs(np.log10(x) - np.log10(lx)) < 0.15 and abs(np.log10(y) - np.log10(ly)) < 0.08: y_off += 8 - fontsize = 8 if row['total_downloads'] > 500000 else 7 + fontsize = 9.5 if row['total_downloads'] > 500000 else 8.5 fontweight = 'bold' if row['total_downloads'] > 1000000 else 'normal' ax_bubble.annotate( name, (x, y), @@ -893,7 +904,7 @@ def figure_7_bubble_chart(output_dir): ax_bubble.legend( legend_bubbles, [f'{v}' for v in legend_sizes], title='DL/User (size)', loc='upper left', - frameon=True, framealpha=0.9, fontsize=8, title_fontsize=8, + frameon=True, framealpha=0.9, fontsize=10, title_fontsize=10, labelspacing=1.5, borderpad=1.2, ) ax_bubble.spines['top'].set_visible(False) @@ -914,7 +925,7 @@ def figure_7_bubble_chart(output_dir): ax_europe.set_xlabel('Year') ax_europe.set_ylabel('Downloads (millions)') ax_europe.set_title('(B) European Countries', fontsize=11, fontweight='bold', loc='left') - ax_europe.legend(loc='upper left', fontsize=6.5, ncol=3, frameon=False) + ax_europe.legend(loc='upper left', fontsize=8, ncol=3, frameon=False) ax_europe.spines['top'].set_visible(False) ax_europe.spines['right'].set_visible(False) ax_europe.grid(True, alpha=0.2) @@ -937,7 +948,7 @@ def figure_7_bubble_chart(output_dir): ax_lmic.set_xlabel('Year') ax_lmic.set_ylabel('Downloads (thousands)') ax_lmic.set_title('(C) Low/Middle Income Countries', fontsize=11, fontweight='bold', loc='left') - ax_lmic.legend(loc='upper left', fontsize=6.5, ncol=3, frameon=False) + ax_lmic.legend(loc='upper 
left', fontsize=8, ncol=3, frameon=False) ax_lmic.spines['top'].set_visible(False) ax_lmic.spines['right'].set_visible(False) ax_lmic.grid(True, alpha=0.2) @@ -1082,7 +1093,7 @@ def figure_filetype_by_region(output_dir): region_totals = df.groupby('region')['downloads'].sum() df['pct'] = df.apply(lambda r: r['downloads'] / region_totals[r['region']] * 100, axis=1) - fig, axes = plt.subplots(1, 2, figsize=(14, 5.5), gridspec_kw={'width_ratios': [1.4, 1]}) + fig, axes = plt.subplots(1, 2, figsize=(15, 6), gridspec_kw={'width_ratios': [1.4, 1]}) # ---- Panel A: Grouped bar chart ---- ax = axes[0] @@ -1102,7 +1113,7 @@ def figure_filetype_by_region(output_dir): color=plt.cm.Set2(i / n_regions), edgecolor='white', linewidth=0.5) ax.set_xticks(x) - ax.set_xticklabels([cat_labels[c] for c in cat_order], fontsize=8) + ax.set_xticklabels([cat_labels[c] for c in cat_order], fontsize=10) ax.set_ylabel('Percentage of Downloads (%)') ax.legend(fontsize=9, frameon=False) ax.spines['top'].set_visible(False) @@ -1144,11 +1155,11 @@ def agg_type(cat): for k, (v, l) in enumerate(zip(vals, left)): if v > 5: ax2.text(l + v / 2, k, f'{v:.0f}%', ha='center', va='center', - fontsize=9, fontweight='bold', color='white') + fontsize=11, fontweight='bold', color='white') left += vals ax2.set_yticks(y_pos) - ax2.set_yticklabels([region_labels[r] for r in region_order], fontsize=10) + ax2.set_yticklabels([region_labels[r] for r in region_order], fontsize=11) ax2.set_xlabel('Percentage of Downloads (%)') ax2.legend(fontsize=8, loc='lower right', frameon=True, framealpha=0.9) ax2.spines['top'].set_visible(False) @@ -1182,7 +1193,7 @@ def figure_hub_distribution(output_dir): print(" SKIPPED - no hubs") return - fig = plt.figure(figsize=(16, 5.5)) + fig = plt.figure(figsize=(18, 6.5)) gs = gridspec.GridSpec(1, 3, width_ratios=[1.1, 1, 1], wspace=0.35) ax_map = fig.add_subplot(gs[0, 0]) ax_bar = fig.add_subplot(gs[0, 1]) @@ -1227,7 +1238,7 @@ def figure_hub_distribution(output_dir): s = np.clip(dl 
/ dl_vals.max() * 300, 10, 300) ax_map.scatter([], [], s=s, c='#3498DB', alpha=0.6, edgecolors='navy', linewidth=0.4, label=label) - ax_map.legend(title='Downloads', loc='lower left', fontsize=7, title_fontsize=7, + ax_map.legend(title='Downloads', loc='lower left', fontsize=9, title_fontsize=9, frameon=True, framealpha=0.9, labelspacing=1.2) # ---- Panel B: Top 15 countries by hub count ---- @@ -1238,7 +1249,7 @@ def figure_hub_distribution(output_dir): bars = ax_bar.barh(range(len(country_counts)), country_counts.values, color=colors_bar, edgecolor='navy', linewidth=0.3, alpha=0.8) ax_bar.set_yticks(range(len(country_counts))) - ax_bar.set_yticklabels(country_counts.index, fontsize=8) + ax_bar.set_yticklabels(country_counts.index, fontsize=10) ax_bar.invert_yaxis() ax_bar.set_xlabel('Number of Hubs') ax_bar.spines['top'].set_visible(False) @@ -1249,7 +1260,7 @@ def figure_hub_distribution(output_dir): for i, (country, count) in enumerate(country_counts.items()): dl = country_downloads.get(country, 0) label = f'{dl/1e6:.1f}M' if dl >= 100000 else f'{dl/1e3:.0f}K' - ax_bar.text(count + 0.5, i, label, va='center', fontsize=7, color='gray') + ax_bar.text(count + 0.5, i, label, va='center', fontsize=9, color='gray') # ---- Panel C: Hub users vs downloads (log-log) ---- ax_scatter.scatter(hubs['unique_users'], hubs['total_downloads'], @@ -1266,9 +1277,11 @@ def figure_hub_distribution(output_dir): # Label top hubs top_hubs = hubs.nlargest(5, 'total_downloads') for _, row in top_hubs.iterrows(): - label = row['city'] if pd.notna(row['city']) and row['city'] else row['country'] + city = row['city'] if pd.notna(row['city']) and row['city'] else '' + country = row['country'] if pd.notna(row['country']) else '' + label = f"{city}, {country}" if city else country ax_scatter.annotate(label, (row['unique_users'], row['total_downloads']), - fontsize=7, fontweight='bold', + fontsize=9, fontweight='bold', xytext=(6, 4), textcoords='offset points') plt.savefig(output_dir / 
'figure_hub_distribution.pdf', format='pdf', bbox_inches='tight') diff --git a/scripts/generate_pride_overview.py b/scripts/generate_pride_overview.py index 10b2acc..9a0be47 100644 --- a/scripts/generate_pride_overview.py +++ b/scripts/generate_pride_overview.py @@ -1,62 +1,162 @@ #!/usr/bin/env python3 """Generate the PRIDE overview supplementary figure (supp_pride_overview.png). -Creates a clean 2x3 panel figure without duplicate panels. +Computes all metrics directly from the Parquet data (2021-2025) +and PRIDE metadata files. No hardcoded numbers. + +Usage: + python scripts/generate_pride_overview.py """ +import os +import subprocess +from pathlib import Path + +import duckdb +import matplotlib +matplotlib.use("Agg") import matplotlib.pyplot as plt import matplotlib.ticker as ticker -import numpy as np -OUTPUT_PATH = "paper/figures/supp_pride_overview.png" +PROJECT_ROOT = Path(__file__).parent.parent +PARQUET_PATH = PROJECT_ROOT / "pride_data" / "data_downloads_parquet.parquet" +PROJECTS_JSON = PROJECT_ROOT / "pride_data" / "all_pride_projects.json" +FILES_JSON = PROJECT_ROOT / "pride_data" / "all_pride_files_metadata.json" +OUTPUT_PATH = PROJECT_ROOT / "paper" / "figures" / "supp_pride_overview.png" + +MIN_YEAR = 2021 -# Data from PRIDE download analysis (2020-2025) -SCALE = { - "Total Downloads": 47.35e6, - "Unique Files": 2.26e6, - "Unique Projects": 32.11e3, - "Unique Users": 807.16e3, -} -REUSE = { - "Downloads\nper Project": 1474.8, - "Downloads\nper File": 20.9, - "Downloads\nper User": 58.7, -} +def _duckdb_conn(): + conn = duckdb.connect() + conn.execute("PRAGMA memory_limit='3GB'") + tmp = os.path.abspath(str(PROJECT_ROOT / "duckdb-tmp")) + os.makedirs(tmp, exist_ok=True) + conn.execute(f"PRAGMA temp_directory='{tmp}'") + conn.execute("PRAGMA threads=2") + return conn -FILE_COVERAGE = 88.0 # % -PROJECT_COVERAGE = 96.4 # % -COUNTRIES = 136 + +def _grep_count(pattern: str, filepath: str) -> int: + """Count lines matching a pattern in a file using grep 
-c.""" + result = subprocess.run( + ["grep", "-c", pattern, filepath], + capture_output=True, text=True, timeout=300, + ) + return int(result.stdout.strip()) if result.returncode == 0 else 0 + + +def compute_metrics(): + """Compute all overview metrics from the Parquet and metadata files.""" + p = str(PARQUET_PATH).replace("'", "''") + conn = _duckdb_conn() + + print("Computing metrics from Parquet (year >= 2021)...") + + row = conn.execute(f""" + SELECT + COUNT(*) AS total_downloads, + COUNT(DISTINCT accession) AS unique_projects, + COUNT(DISTINCT filename) AS unique_files, + COUNT(DISTINCT "user") AS unique_users, + MIN(year) AS min_year, + MAX(year) AS max_year + FROM read_parquet('{p}') + WHERE year >= {MIN_YEAR} + """).fetchone() + + total_dl, n_projects, n_files, n_users, min_yr, max_yr = row + + countries_row = conn.execute(f""" + SELECT COUNT(*) FROM ( + SELECT country + FROM read_parquet('{p}') + WHERE year >= {MIN_YEAR} + AND country IS NOT NULL AND country != '' + AND country NOT LIKE '%{{%' + GROUP BY country + HAVING COUNT(*) > 100 + ) + """).fetchone() + n_countries = countries_row[0] + + total_records = conn.execute(f""" + SELECT COUNT(*) FROM read_parquet('{p}') + """).fetchone()[0] + + conn.close() + + # Project coverage: parquet projects / total public PRIDE projects + print("Counting total public PRIDE projects...") + total_pride_projects = _grep_count('"accession" : "PXD', str(PROJECTS_JSON)) + + # File coverage: parquet files / total PRIDE files + print("Counting total PRIDE files...") + total_pride_files = _grep_count('"fileName"', str(FILES_JSON)) + + project_coverage = (n_projects / total_pride_projects * 100 + if total_pride_projects > 0 else 0) + file_coverage = (n_files / total_pride_files * 100 + if total_pride_files > 0 else 0) + + metrics = { + "total_downloads": total_dl, + "unique_projects": n_projects, + "unique_files": n_files, + "unique_users": n_users, + "dl_per_project": total_dl / max(n_projects, 1), + "dl_per_file": total_dl / 
max(n_files, 1), + "dl_per_user": total_dl / max(n_users, 1), + "n_countries": n_countries, + "project_coverage": round(project_coverage, 1), + "file_coverage": round(file_coverage, 1), + "total_records": total_records, + "min_year": min_yr, + "max_year": max_yr, + "total_pride_projects": total_pride_projects, + "total_pride_files": total_pride_files, + } + + print("\nComputed metrics:") + for k, v in metrics.items(): + print(f" {k}: {v:,.1f}" if isinstance(v, float) else f" {k}: {v:,}") + + return metrics def fmt_count(v): if v >= 1e6: - return f"{v/1e6:.2f} M" + return f"{v / 1e6:.2f} M" if v >= 1e3: - return f"{v/1e3:.2f} K" + return f"{v / 1e3:.2f} K" return f"{v:.0f}" -def main(): +def make_figure(m): + """Create the 2x3 panel overview figure.""" fig, axes = plt.subplots(2, 3, figsize=(14, 7.5)) fig.suptitle( - "PRIDE Download Activity Overview (2020\u20132025)", - fontsize=15, - fontweight="bold", - y=0.98, + f"PRIDE Download Activity Overview ({m['min_year']}\u2013{m['max_year']})", + fontsize=15, fontweight="bold", y=0.98, ) # --- (A) Overall Scale Metrics --- ax = axes[0, 0] - labels = list(SCALE.keys()) - values = list(SCALE.values()) + scale_labels = ["Total Downloads", "Unique Files", "Unique Projects", "Unique Users"] + scale_values = [ + m["total_downloads"], m["unique_files"], + m["unique_projects"], m["unique_users"], + ] colors = ["#e74c3c", "#3498db", "#2ecc71", "#f39c12"] - bars = ax.barh(labels, values, color=colors, edgecolor="white", height=0.6) - for bar, v in zip(bars, values): - ax.text(bar.get_width() + max(values) * 0.02, bar.get_y() + bar.get_height() / 2, + bars = ax.barh(scale_labels, scale_values, color=colors, + edgecolor="white", height=0.6) + for bar, v in zip(bars, scale_values): + ax.text(bar.get_width() + max(scale_values) * 0.02, + bar.get_y() + bar.get_height() / 2, fmt_count(v), va="center", fontsize=10, fontweight="bold") - ax.set_xlim(0, max(values) * 1.25) - ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: 
fmt_count(x))) + ax.set_xlim(0, max(scale_values) * 1.25) + ax.xaxis.set_major_formatter( + ticker.FuncFormatter(lambda x, _: fmt_count(x))) ax.set_title("Overall Scale", fontsize=12, fontweight="bold") ax.invert_yaxis() ax.spines["top"].set_visible(False) @@ -64,14 +164,18 @@ def main(): # --- (B) Reuse Intensity --- ax = axes[0, 1] - labels_r = list(REUSE.keys()) - values_r = list(REUSE.values()) - bars = ax.bar(labels_r, values_r, color=["#3498db", "#e67e22", "#2ecc71"], + reuse_labels = ["Downloads\nper Project", "Downloads\nper File", + "Downloads\nper User"] + reuse_values = [m["dl_per_project"], m["dl_per_file"], m["dl_per_user"]] + bars = ax.bar(reuse_labels, reuse_values, + color=["#3498db", "#e67e22", "#2ecc71"], edgecolor="white", width=0.6) - for bar, v in zip(bars, values_r): - ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + max(values_r) * 0.02, - f"{v:,.1f}", ha="center", va="bottom", fontsize=10, fontweight="bold") - ax.set_ylim(0, max(values_r) * 1.15) + for bar, v in zip(bars, reuse_values): + ax.text(bar.get_x() + bar.get_width() / 2, + bar.get_height() + max(reuse_values) * 0.02, + f"{v:,.1f}", ha="center", va="bottom", + fontsize=10, fontweight="bold") + ax.set_ylim(0, max(reuse_values) * 1.15) ax.set_title("Reuse Intensity", fontsize=12, fontweight="bold") ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) @@ -79,56 +183,60 @@ def main(): # --- (C) Geographic Reach --- ax = axes[0, 2] - ax.text(0.5, 0.55, f"{COUNTRIES}", ha="center", va="center", + ax.text(0.5, 0.55, f"{m['n_countries']}", ha="center", va="center", fontsize=52, fontweight="bold", color="#2c3e50", transform=ax.transAxes) ax.text(0.5, 0.25, "Countries / Territories\n(>100 downloads each)", ha="center", va="center", fontsize=11, color="#7f8c8d", transform=ax.transAxes) - ax.set_xlim(0, 1) - ax.set_ylim(0, 1) + ax.set_xlim(0, 1); ax.set_ylim(0, 1) ax.axis("off") ax.set_title("Geographic Reach", fontsize=12, fontweight="bold") # --- (D) Project 
Coverage --- ax = axes[1, 0] - wedges, _ = ax.pie( - [PROJECT_COVERAGE, 100 - PROJECT_COVERAGE], - colors=["#3498db", "#ecf0f1"], - startangle=90, - wedgeprops=dict(width=0.35, edgecolor="white", linewidth=2), - ) - ax.text(0, 0, f"{PROJECT_COVERAGE}%", ha="center", va="center", + pc = m["project_coverage"] + ax.pie([pc, 100 - pc], colors=["#3498db", "#ecf0f1"], startangle=90, + wedgeprops=dict(width=0.35, edgecolor="white", linewidth=2)) + ax.text(0, 0, f"{pc}%", ha="center", va="center", fontsize=20, fontweight="bold", color="#2c3e50") - ax.set_title("Project Coverage\n(of public datasets)", fontsize=12, fontweight="bold") + ax.set_title("Project Coverage\n(of public datasets)", + fontsize=12, fontweight="bold") # --- (E) File Coverage --- ax = axes[1, 1] - wedges, _ = ax.pie( - [FILE_COVERAGE, 100 - FILE_COVERAGE], - colors=["#2ecc71", "#ecf0f1"], - startangle=90, - wedgeprops=dict(width=0.35, edgecolor="white", linewidth=2), - ) - ax.text(0, 0, f"{FILE_COVERAGE}%", ha="center", va="center", + fc = m["file_coverage"] + ax.pie([fc, 100 - fc], colors=["#2ecc71", "#ecf0f1"], startangle=90, + wedgeprops=dict(width=0.35, edgecolor="white", linewidth=2)) + ax.text(0, 0, f"{fc}%", ha="center", va="center", fontsize=20, fontweight="bold", color="#2c3e50") - ax.set_title("File Coverage\n(downloaded at least once)", fontsize=12, fontweight="bold") + ax.set_title("File Coverage\n(downloaded at least once)", + fontsize=12, fontweight="bold") # --- (F) Time period label --- ax = axes[1, 2] - ax.text(0.5, 0.55, "Jan 2020 \u2013 Jan 2025", ha="center", va="center", - fontsize=14, fontweight="bold", color="#2c3e50", - transform=ax.transAxes) - ax.text(0.5, 0.30, "5 years of download logs\n159.3 M raw records", + ax.text(0.5, 0.55, + f"Jan {m['min_year']} \u2013 Dec {m['max_year']}", + ha="center", va="center", fontsize=14, fontweight="bold", + color="#2c3e50", transform=ax.transAxes) + ax.text(0.5, 0.30, + f"5 years of download logs\n{m['total_records'] / 1e6:.1f} M raw 
records", ha="center", va="center", fontsize=11, color="#7f8c8d", transform=ax.transAxes) ax.axis("off") ax.set_title("Study Period", fontsize=12, fontweight="bold") plt.tight_layout(rect=[0, 0, 1, 0.94]) - fig.savefig(OUTPUT_PATH, dpi=200, bbox_inches="tight", facecolor="white") + os.makedirs(OUTPUT_PATH.parent, exist_ok=True) + fig.savefig(str(OUTPUT_PATH), dpi=200, bbox_inches="tight", + facecolor="white") plt.close() - print(f"Saved: {OUTPUT_PATH}") + print(f"\nSaved: {OUTPUT_PATH}") + + +def main(): + metrics = compute_metrics() + make_figure(metrics) if __name__ == "__main__": diff --git a/scripts/generate_supp_figures.py b/scripts/generate_supp_figures.py index ff9d629..b112fdd 100644 --- a/scripts/generate_supp_figures.py +++ b/scripts/generate_supp_figures.py @@ -351,7 +351,7 @@ def supp_fig_protocol_overall(conn, output_dir): df = conn.execute(f""" SELECT method as protocol, COUNT(*) as downloads FROM read_parquet('{p}') - WHERE year >= 2020 {FILT()} + WHERE year >= 2021 {FILT()} GROUP BY method ORDER BY downloads DESC """).df() @@ -443,7 +443,7 @@ def supp_fig_consistency_heatmap(conn, output_dir): yearly = conn.execute(f""" SELECT accession, year, COUNT(*) as downloads FROM read_parquet('{p}') - WHERE accession IN ({accessions_sql}) AND year >= 2020 {FILT()} + WHERE accession IN ({accessions_sql}) AND year >= 2021 {FILT()} GROUP BY accession, year ORDER BY accession, year """).df() diff --git a/scripts/run_full_analysis.py b/scripts/run_full_analysis.py index c617a5f..f8512c5 100644 --- a/scripts/run_full_analysis.py +++ b/scripts/run_full_analysis.py @@ -151,7 +151,7 @@ def analysis_2_temporal(conn, parquet_path, output_dir, has_filter=False): COUNT(DISTINCT accession) as unique_datasets, COUNT(DISTINCT geo_location) as unique_locations FROM read_parquet('{p}') - WHERE year >= 2020 AND year <= 2025 {filt} + WHERE year >= 2021 AND year <= 2025 {filt} GROUP BY year ORDER BY year """ @@ -165,7 +165,7 @@ def analysis_2_temporal(conn, parquet_path, 
output_dir, has_filter=False): month, COUNT(*) as total_downloads FROM read_parquet('{p}') - WHERE year >= 2020 AND year <= 2025 {filt} + WHERE year >= 2021 AND year <= 2025 {filt} GROUP BY year, month ORDER BY year, month """ @@ -192,7 +192,7 @@ def analysis_3_protocols(conn, parquet_path, output_dir, has_filter=False): year, COUNT(*) as downloads FROM read_parquet('{p}') - WHERE year >= 2020 {filt} + WHERE year >= 2021 {filt} GROUP BY method, year ORDER BY year, downloads DESC """ @@ -362,7 +362,7 @@ def analysis_6_hourly_patterns(conn, parquet_path, output_dir, has_filter=False) DAYOFWEEK(date) as day_of_week, COUNT(*) as downloads FROM read_parquet('{p}') - WHERE year >= 2020 {filt} + WHERE year >= 2021 {filt} GROUP BY hour, day_of_week ORDER BY hour, day_of_week """