diff --git a/scibot/extract.py b/scibot/extract.py
index d380bde..2198fca 100644
--- a/scibot/extract.py
+++ b/scibot/extract.py
@@ -340,6 +340,17 @@ def find_rrids(text):
     for prefix, a, b, nums, suffix in matches4:
         yield prefix, f'RRID:SAMN{nums.strip()}', f'{a}{b}{nums}', suffix
 
+    # fifth round for NCBI Taxonomy
+    # Matches e.g. 'NCBITaxon_9606' or 'NCBI taxid: 9606': 'NCBI', an optional
+    # ':' and/or whitespace, a taxonomy keyword, a required '_' or ':', optional
+    # whitespace, then 3-8 digits followed by a non-word character.
+    regex5 = r'(.{0,32})(NCBI)(:?\s?)(taxid|Taxid|taxId|TaxId|taxID|TaxID|txid|txId|txID|Txid|TxId|TxID|Taxon|Taxonomy)(_|:)(\s?)([0-9]{3,8})([^\w].{0,31})'
+    matches5 = re.findall(regex5, text)
+    for prefix, ncbi, delim1, tax, delim2, delim3, nums, suffix in matches5:
+        # keep every captured delimiter so this string equals the matched
+        # text, the same way the fourth round rebuilds it from all its groups
+        exact = f'{ncbi}{delim1}{tax}{delim2}{delim3}{nums}'
+        yield prefix, f'RRID:NCBITaxon_{nums.strip()}', exact, suffix
+
 
 # extract from post
 