diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml new file mode 100644 index 0000000..db9f2d8 --- /dev/null +++ b/.github/workflows/run-tests.yml @@ -0,0 +1,37 @@ +name: Run pytests + +on: + push: + branches: [ "develop" ] + pull_request: + branches: [ "develop" ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest + diff --git a/README.md b/README.md index 7ff764f..adabe18 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,11 @@ poetry shell poetry install ``` +* To update the Darwin Core terms supported by the dwcahandler package +``` +poetry run update-dwc-terms +``` + ### Build To build dwcahandler package ``` @@ -62,4 +67,48 @@ ext_csvs = [CsvFileType(files=['/tmp/multimedia.csv'], type='multimedia')] DwcaHandler.create_dwca(core_csv=core_csv, ext_csv_list=ext_csvs, output_dwca_path='/tmp/dwca.zip') ``` +* Merge Darwin Core Archive +``` +from dwcahandler import DwcaHandler +DwcaHandler.merge_dwca(dwca_file='/tmp/dwca.zip', delta_dwca_file='/tmp/delta-dwca.zip', + output_dwca_path='/tmp/new-dwca.zip', + keys_lookup={'occurrence':'occurrenceID'}) +``` + +* Delete Rows from core file in Darwin Core Archive +``` +from dwcahandler import CsvFileType +from dwcahandler import DwcaHandler + +delete_csv = 
CsvFileType(files=['/tmp/old-records.csv'], type='occurrence', keys='occurrenceID') + +DwcaHandler.delete_records(dwca_file='/tmp/dwca.zip', + records_to_delete=delete_csv, + output_dwca_path='/tmp/new-dwca.zip') +``` + +* Other usages may include subclassing the `Dwca` class, modifying the core dataframe content and rebuilding the dwca. +``` +from dwcahandler import Dwca + +class DerivedDwca(Dwca): + """ + Derived class to perform other custom operations that are not included as part of the core operations + """ + def _drop_columns(self): + """ + Drop existing columns in the core content + """ + self.core_content.df_content.drop(columns=['column1', 'column2'], inplace=True) + self._update_meta_fields(self.core_content) + + +dwca = DerivedDwca(dwca_file_loc='/tmp/dwca.zip') +dwca._extract_dwca() +dwca._drop_columns() +dwca._generate_eml() +dwca._generate_meta() +dwca._write_dwca('/tmp/newdwca.zip') + +``` diff --git a/poetry.lock b/poetry.lock index d853931..0dc3413 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,161 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "certifi" +version = "2023.11.17" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.11.17-py3-none-any.whl", hash = "sha256:e036ab49d5b79556f99cfc2d9320b34cfbe5be05c5871b51de9329f0603b0474"}, + {file = "certifi-2023.11.17.tar.gz", hash = "sha256:9b469f3a900bf28dc19b8cfbf8019bf47f7fdd1a65a1d4ffb98fc14166beb4d1"}, +] + +[[package]] +name = "charset-normalizer" +version = "3.3.2" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
+optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = 
"sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, + {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, + {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, + {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, + {file = 
"charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, + {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, + {file = 
"charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, + {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, + {file = 
"charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = 
"sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, + {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, + {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, + {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.0" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, + {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] [[package]] name = "numpy" @@ -41,6 +198,62 @@ files = [ {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"}, ] +[[package]] +name = "numpy" +version = "1.26.2" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = 
">=3.9" +files = [ + {file = "numpy-1.26.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3703fc9258a4a122d17043e57b35e5ef1c5a5837c3db8be396c82e04c1cf9b0f"}, + {file = "numpy-1.26.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cc392fdcbd21d4be6ae1bb4475a03ce3b025cd49a9be5345d76d7585aea69440"}, + {file = "numpy-1.26.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36340109af8da8805d8851ef1d74761b3b88e81a9bd80b290bbfed61bd2b4f75"}, + {file = "numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcc008217145b3d77abd3e4d5ef586e3bdfba8fe17940769f8aa09b99e856c00"}, + {file = "numpy-1.26.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3ced40d4e9e18242f70dd02d739e44698df3dcb010d31f495ff00a31ef6014fe"}, + {file = "numpy-1.26.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b272d4cecc32c9e19911891446b72e986157e6a1809b7b56518b4f3755267523"}, + {file = "numpy-1.26.2-cp310-cp310-win32.whl", hash = "sha256:22f8fc02fdbc829e7a8c578dd8d2e15a9074b630d4da29cda483337e300e3ee9"}, + {file = "numpy-1.26.2-cp310-cp310-win_amd64.whl", hash = "sha256:26c9d33f8e8b846d5a65dd068c14e04018d05533b348d9eaeef6c1bd787f9919"}, + {file = "numpy-1.26.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b96e7b9c624ef3ae2ae0e04fa9b460f6b9f17ad8b4bec6d7756510f1f6c0c841"}, + {file = "numpy-1.26.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:aa18428111fb9a591d7a9cc1b48150097ba6a7e8299fb56bdf574df650e7d1f1"}, + {file = "numpy-1.26.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06fa1ed84aa60ea6ef9f91ba57b5ed963c3729534e6e54055fc151fad0423f0a"}, + {file = "numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96ca5482c3dbdd051bcd1fce8034603d6ebfc125a7bd59f55b40d8f5d246832b"}, + {file = "numpy-1.26.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:854ab91a2906ef29dc3925a064fcd365c7b4da743f84b123002f6139bcb3f8a7"}, + {file = 
"numpy-1.26.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f43740ab089277d403aa07567be138fc2a89d4d9892d113b76153e0e412409f8"}, + {file = "numpy-1.26.2-cp311-cp311-win32.whl", hash = "sha256:a2bbc29fcb1771cd7b7425f98b05307776a6baf43035d3b80c4b0f29e9545186"}, + {file = "numpy-1.26.2-cp311-cp311-win_amd64.whl", hash = "sha256:2b3fca8a5b00184828d12b073af4d0fc5fdd94b1632c2477526f6bd7842d700d"}, + {file = "numpy-1.26.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a4cd6ed4a339c21f1d1b0fdf13426cb3b284555c27ac2f156dfdaaa7e16bfab0"}, + {file = "numpy-1.26.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5d5244aabd6ed7f312268b9247be47343a654ebea52a60f002dc70c769048e75"}, + {file = "numpy-1.26.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a3cdb4d9c70e6b8c0814239ead47da00934666f668426fc6e94cce869e13fd7"}, + {file = "numpy-1.26.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aa317b2325f7aa0a9471663e6093c210cb2ae9c0ad824732b307d2c51983d5b6"}, + {file = "numpy-1.26.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:174a8880739c16c925799c018f3f55b8130c1f7c8e75ab0a6fa9d41cab092fd6"}, + {file = "numpy-1.26.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f79b231bf5c16b1f39c7f4875e1ded36abee1591e98742b05d8a0fb55d8a3eec"}, + {file = "numpy-1.26.2-cp312-cp312-win32.whl", hash = "sha256:4a06263321dfd3598cacb252f51e521a8cb4b6df471bb12a7ee5cbab20ea9167"}, + {file = "numpy-1.26.2-cp312-cp312-win_amd64.whl", hash = "sha256:b04f5dc6b3efdaab541f7857351aac359e6ae3c126e2edb376929bd3b7f92d7e"}, + {file = "numpy-1.26.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4eb8df4bf8d3d90d091e0146f6c28492b0be84da3e409ebef54349f71ed271ef"}, + {file = "numpy-1.26.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a13860fdcd95de7cf58bd6f8bc5a5ef81c0b0625eb2c9a783948847abbef2c2"}, + {file = "numpy-1.26.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:64308ebc366a8ed63fd0bf426b6a9468060962f1a4339ab1074c228fa6ade8e3"}, + {file = "numpy-1.26.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baf8aab04a2c0e859da118f0b38617e5ee65d75b83795055fb66c0d5e9e9b818"}, + {file = "numpy-1.26.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d73a3abcac238250091b11caef9ad12413dab01669511779bc9b29261dd50210"}, + {file = "numpy-1.26.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b361d369fc7e5e1714cf827b731ca32bff8d411212fccd29ad98ad622449cc36"}, + {file = "numpy-1.26.2-cp39-cp39-win32.whl", hash = "sha256:bd3f0091e845164a20bd5a326860c840fe2af79fa12e0469a12768a3ec578d80"}, + {file = "numpy-1.26.2-cp39-cp39-win_amd64.whl", hash = "sha256:2beef57fb031dcc0dc8fa4fe297a742027b954949cabb52a2a376c144e5e6060"}, + {file = "numpy-1.26.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:1cc3d5029a30fb5f06704ad6b23b35e11309491c999838c31f124fee32107c79"}, + {file = "numpy-1.26.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94cc3c222bb9fb5a12e334d0479b97bb2df446fbe622b470928f5284ffca3f8d"}, + {file = "numpy-1.26.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fe6b44fb8fcdf7eda4ef4461b97b3f63c466b27ab151bec2366db8b197387841"}, + {file = "numpy-1.26.2.tar.gz", hash = "sha256:f65738447676ab5777f11e6bbbdb8ce11b785e105f690bc45966574816b6d3ea"}, +] + +[[package]] +name = "packaging" +version = "23.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, + {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, +] + [[package]] name = "pandas" version = "2.1.1" @@ -109,6 +322,43 @@ sql-other = ["SQLAlchemy (>=1.4.36)"] test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] 
xml = ["lxml (>=4.8.0)"] +[[package]] +name = "pluggy" +version = "1.3.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pytest" +version = "7.4.3" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.3-py3-none-any.whl", hash = "sha256:0d009c083ea859a71b76adf7c1d502e4bc170b80a8ef002da5806527b9591fac"}, + {file = "pytest-7.4.3.tar.gz", hash = "sha256:d989d136982de4e3b29dabcc838ad581c64e8ed52c11fbe86ddebd9da0818cd5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "python-dateutil" version = "2.8.2" @@ -134,6 +384,27 @@ files = [ {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, ] +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, + {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + [[package]] name = "six" version = "1.16.0" @@ -145,6 +416,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + [[package]] name = "tzdata" version = "2023.3" @@ -156,7 +438,23 @@ files = [ {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, ] +[[package]] +name = "urllib3" +version = "2.1.0" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.1.0-py3-none-any.whl", hash = "sha256:55901e917a5896a349ff771be919f8bd99aff50b79fe58fec595eb37bbc56bb3"}, + {file = "urllib3-2.1.0.tar.gz", hash = "sha256:df7aa8afb0148fa78488e7899b2c59b5f4ffcfa82e6c54ccb9dd37c1d7b52d54"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + [metadata] lock-version = "2.0" -python-versions = ">=3.10,<3.13" -content-hash = "4b2c92db8159cd16f77d99f858f295f71c9736bc99846b94e2f40f47b8afbbdd" +python-versions = ">=3.9" +content-hash = "76959e6346a8ca6495e926d55182302e394ae843edeb7d4adccba85240ab6669" diff --git a/pyproject.toml b/pyproject.toml index edf5146..2e7c597 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dwcahandler" -version = "0.0.1" +version = "0.0.2" description = "Python package to handle Darwin Core Archive (DwCA) operations. This includes creating a DwCA zip file from one or more csvs, reading a DwCA, merge two DwCAs, validate DwCA and delete records from DwCA based on one or more key columns" authors = ["Atlas of Living Australia data team "] maintainers = ["Atlas of Living Australia data team "] @@ -9,8 +9,14 @@ license = "MPL-1.1" readme = "README.md" [tool.poetry.dependencies] -python = ">=3.9,<3.13" -pandas = "^2.1.1" +python = ">=3.9" +pandas = "^2.1.0" +requests = "^2.31.0" +pytest = "^7.4.3" +pytest-mock = "^3.12.0" + +[tool.poetry.scripts] +update-dwc-terms = "dwcahandler.scripts.update_dwc_terms:update_terms" [build-system] requires = ["poetry-core"] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4b89a55 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +pandas==2.1.0 +requests==2.31.0 +pytest==7.4.3 +pytest-mock==3.12.0 diff --git a/src/dwcahandler/dwca/__init__.py b/src/dwcahandler/dwca/__init__.py index f2509ca..0f44955 100644 --- 
a/src/dwcahandler/dwca/__init__.py +++ b/src/dwcahandler/dwca/__init__.py @@ -137,6 +137,7 @@ def __str__(self) -> str: # Imports at end of file to allow classes to be used +from .terms import Terms from .dwca_meta import Element, MetaElementTypes, MetaElementInfo, MetaDwCA from .base_dwca import BaseDwca from .core_dwca import Dwca, DfContent diff --git a/src/dwcahandler/dwca/dwca_factory.py b/src/dwcahandler/dwca/dwca_factory.py index 46de357..7d5f06f 100644 --- a/src/dwcahandler/dwca/dwca_factory.py +++ b/src/dwcahandler/dwca/dwca_factory.py @@ -1,5 +1,7 @@ + from abc import ABCMeta, abstractmethod -from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca +import pandas as pd +from dwcahandler.dwca import CsvFileType, BaseDwca, Dwca, LargeDwca, Terms import logging from pathlib import Path @@ -141,6 +143,12 @@ def get_dwca_from_dwca_file(dwca_file: str, use_chunking: bool = False, work_dir class DwcaHandler: + + @staticmethod + def list_dwc_terms() -> pd.DataFrame: + return Terms().dwc_terms_df + + """Perform various DwCA operations""" @staticmethod diff --git a/src/dwcahandler/dwca/dwca_meta.py b/src/dwcahandler/dwca/dwca_meta.py index 645589e..91b9f9b 100644 --- a/src/dwcahandler/dwca/dwca_meta.py +++ b/src/dwcahandler/dwca/dwca_meta.py @@ -8,21 +8,16 @@ """ import xml.etree.ElementTree as ET from xml.dom import minidom -import pandas as pd -from dwcahandler.dwca import CSVEncoding, CoreOrExtType +from dwcahandler.dwca import CSVEncoding, CoreOrExtType, Terms import urllib -from pathlib import Path from dataclasses import dataclass, field, asdict from typing import ClassVar from typing import Optional -import os import re -this_dir, this_filename = os.path.split(__file__) - -@dataclass() -class Element(): +@dataclass +class Element: """A mapping of a name to a URI, giving the class of a row type""" name: str row_type_ns: str @@ -113,41 +108,30 @@ class Field: @dataclass -class MetaElementAttributes(): +class MetaElementAttributes: """A 
meta-description of a DwCA file""" meta_element_type: MetaElementInfo fields: list[Field] = field(default_factory=list) -def absolute_file_paths(directory): - """Convert files in a directory into absolute paths and return - as a a generator - - :param directory: The directory to scan. - :return: An absolute file path. - """ - for dirpath, _, filenames in os.walk(directory): - for f in filenames: - if re.fullmatch(r'.+\..*', f): - yield os.path.abspath(os.path.join(dirpath, f)) - - -@dataclass() +@dataclass class MetaDwCA: """Complete Metadata for a DwCA including dataset metadata and schema information""" EML_XML_FILENAME: str = field(default='eml.xml') dwca_meta: ET.Element = field(init=False) meta_elements: list[MetaElementAttributes] = field(default_factory=list, init=False) - TERMS: list[Path] = field(default_factory=lambda: [c for c in absolute_file_paths(f"{this_dir}/terms")], init=False) def __post_init__(self): - self.terms_df = pd.DataFrame() - for term in self.TERMS: + self.terms_df = Terms().terms_df + + """ + for term in self.TERMS_PATH: df = pd.read_csv(term, dtype='str') if not self.terms_df.empty: self.terms_df = self.terms_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri']) else: self.terms_df = df + """ # initialise own instance of meta content self.dwca_meta = ET.Element('archive') @@ -161,10 +145,10 @@ def extract_field_attr_value(field, attrib): meta_element_info = MetaElementInfo(core_or_ext_type=core_or_ext_type, type=MetaElementTypes.get_element_by_row_type(node_elm.attrib['rowType']), csv_encoding=CSVEncoding( - csv_delimiter=node_elm.attrib['fieldsTerminatedBy'], - csv_eol=node_elm.attrib['linesTerminatedBy'], - csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if - node_elm.attrib['fieldsEnclosedBy'] != '' else '"'), + csv_delimiter=node_elm.attrib['fieldsTerminatedBy'], + csv_eol=node_elm.attrib['linesTerminatedBy'], + csv_text_enclosure=node_elm.attrib['fieldsEnclosedBy'] if + 
node_elm.attrib['fieldsEnclosedBy'] != '' else '"'), ignore_header_lines=node_elm.attrib['ignoreHeaderLines'], charset_encoding=node_elm.attrib['encoding'], file_name=file_name) diff --git a/src/dwcahandler/dwca/terms.py b/src/dwcahandler/dwca/terms.py new file mode 100644 index 0000000..0d81805 --- /dev/null +++ b/src/dwcahandler/dwca/terms.py @@ -0,0 +1,61 @@ +import pandas as pd +from dataclasses import dataclass, field, asdict +import os +from pathlib import Path +import re + +this_dir, this_filename = os.path.split(__file__) + +def absolute_file_paths(directory): + """Convert files in a directory into absolute paths and return + as a generator + + :param directory: The directory to scan. + :return: An absolute file path. + """ + for dirpath, _, filenames in os.walk(directory): + for f in filenames: + if re.fullmatch(r'.+\..*', f): + yield os.path.abspath(os.path.join(dirpath, f)) + + +@dataclass +class Terms: + TERMS_DWC_URL = "https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/terms/terms.csv" + DWC_FILENAME = 'darwin-core-terms.csv' + DUBLIN_CORE_FILENAME = 'dublin-core-terms.csv' + TERMS_DIR = f"{this_dir}/terms" + DWC_FILE_PATH = f"{TERMS_DIR}/{DWC_FILENAME}" + DUBLIN_CORE_PATH = f"{TERMS_DIR}/{DUBLIN_CORE_FILENAME}" + + terms_path: list[Path] = field(default_factory=lambda: [c for c in absolute_file_paths(Terms.TERMS_DIR)], + init=False) + terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) + dwc_terms_df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False) + + def __post_init__(self): + def _add_to_df(existing_df: pd.DataFrame, df: pd.DataFrame): + if not existing_df.empty: + return existing_df.merge(df, how='outer', left_on=['term', 'uri'], right_on=['term', 'uri']) + return df + + for term_path in self.terms_path: + df = pd.read_csv(term_path, dtype='str') + self.terms_df = _add_to_df(self.terms_df, df) + if term_path == Terms.DWC_FILE_PATH or term_path == Terms.DUBLIN_CORE_PATH: + self.dwc_terms_df = 
_add_to_df(self.dwc_terms_df, df) + + @staticmethod + def update_dwc_terms(): + """ + Pull the latest terms from the TDWG Darwin Core terms csv url and update the darwin core vocab terms in the package + This is still WIP: do we need to pull a specific version of the vocab? + :return: dataframe of the updated terms with term and uri columns + """ + df = pd.read_csv(Terms.TERMS_DWC_URL, delimiter=",", encoding='utf-8', dtype='str') + df = df[df["term_deprecated"].isnull()] + dwc_df = pd.DataFrame() + dwc_df['term'] = df['term_localName'] + dwc_df['uri'] = df['term_isDefinedBy'] + df['term_localName'] + dwc_df.to_csv(Terms.DWC_FILE_PATH, index=False) + return dwc_df diff --git a/src/dwcahandler/dwca/terms/darwin_core-terms.csv b/src/dwcahandler/dwca/terms/darwin-core-terms.csv similarity index 75% rename from src/dwcahandler/dwca/terms/darwin_core-terms.csv rename to src/dwcahandler/dwca/terms/darwin-core-terms.csv index cdc5aab..e246f80 100644 --- a/src/dwcahandler/dwca/terms/darwin_core-terms.csv +++ b/src/dwcahandler/dwca/terms/darwin-core-terms.csv @@ -1,34 +1,16 @@ -term,uri -accessRights,http://purl.org/dc/terms/accessRights -bibliographicCitation,http://purl.org/dc/terms/bibliographicCitation -language,http://purl.org/dc/terms/language -license,http://purl.org/dc/terms/license -modified,http://purl.org/dc/terms/modified -references,http://purl.org/dc/terms/references -rights,http://purl.org/dc/terms/rights -rightsHolder,http://purl.org/dc/terms/rightsHolder -type,http://purl.org/dc/terms/type +term,uri acceptedNameUsage,http://rs.tdwg.org/dwc/terms/acceptedNameUsage acceptedNameUsageID,http://rs.tdwg.org/dwc/terms/acceptedNameUsageID -acceptedScientificName,http://rs.tdwg.org/dwc/terms/acceptedScientificName -acceptedScientificNameID,http://rs.tdwg.org/dwc/terms/acceptedScientificNameID -acceptedTaxonID,http://rs.tdwg.org/dwc/terms/acceptedTaxonID -acceptedTaxonName,http://rs.tdwg.org/dwc/terms/acceptedTaxonName -acceptedTaxonNameID,http://rs.tdwg.org/dwc/terms/acceptedTaxonNameID -accordingTo,http://rs.tdwg.org/dwc/terms/accordingTo
-accuracy,http://rs.tdwg.org/dwc/terms/accuracy associatedMedia,http://rs.tdwg.org/dwc/terms/associatedMedia associatedOccurrences,http://rs.tdwg.org/dwc/terms/associatedOccurrences associatedOrganisms,http://rs.tdwg.org/dwc/terms/associatedOrganisms associatedReferences,http://rs.tdwg.org/dwc/terms/associatedReferences associatedSequences,http://rs.tdwg.org/dwc/terms/associatedSequences associatedTaxa,http://rs.tdwg.org/dwc/terms/associatedTaxa -basionym,http://rs.tdwg.org/dwc/terms/basionym -basionymID,http://rs.tdwg.org/dwc/terms/basionymID basisOfRecord,http://rs.tdwg.org/dwc/terms/basisOfRecord bed,http://rs.tdwg.org/dwc/terms/bed behavior,http://rs.tdwg.org/dwc/terms/behavior -binomial,http://rs.tdwg.org/dwc/terms/binomial +caste,http://rs.tdwg.org/dwc/terms/caste catalogNumber,http://rs.tdwg.org/dwc/terms/catalogNumber class,http://rs.tdwg.org/dwc/terms/class collectionCode,http://rs.tdwg.org/dwc/terms/collectionCode @@ -39,6 +21,7 @@ coordinateUncertaintyInMeters,http://rs.tdwg.org/dwc/terms/coordinateUncertainty country,http://rs.tdwg.org/dwc/terms/country countryCode,http://rs.tdwg.org/dwc/terms/countryCode county,http://rs.tdwg.org/dwc/terms/county +cultivarEpithet,http://rs.tdwg.org/dwc/terms/cultivarEpithet dataGeneralizations,http://rs.tdwg.org/dwc/terms/dataGeneralizations datasetID,http://rs.tdwg.org/dwc/terms/datasetID datasetName,http://rs.tdwg.org/dwc/terms/datasetName @@ -56,19 +39,12 @@ earliestEraOrLowestErathem,http://rs.tdwg.org/dwc/terms/earliestEraOrLowestErath earliestPeriodOrLowestSystem,http://rs.tdwg.org/dwc/terms/earliestPeriodOrLowestSystem endDayOfYear,http://rs.tdwg.org/dwc/terms/endDayOfYear establishmentMeans,http://rs.tdwg.org/dwc/terms/establishmentMeans -eventAttributes,http://rs.tdwg.org/dwc/terms/eventAttributes +Event,http://rs.tdwg.org/dwc/terms/Event eventDate,http://rs.tdwg.org/dwc/terms/eventDate eventID,http://rs.tdwg.org/dwc/terms/eventID -eventMeasurementAccuracy,http://rs.tdwg.org/dwc/terms/eventMeasurementAccuracy 
-eventMeasurementDeterminedBy,http://rs.tdwg.org/dwc/terms/eventMeasurementDeterminedBy -eventMeasurementDeterminedDate,http://rs.tdwg.org/dwc/terms/eventMeasurementDeterminedDate -eventMeasurementID,http://rs.tdwg.org/dwc/terms/eventMeasurementID -eventMeasurementRemarks,http://rs.tdwg.org/dwc/terms/eventMeasurementRemarks -eventMeasurementType,http://rs.tdwg.org/dwc/terms/eventMeasurementType -eventMeasurementUnit,http://rs.tdwg.org/dwc/terms/eventMeasurementUnit -eventMeasurementValue,http://rs.tdwg.org/dwc/terms/eventMeasurementValue eventRemarks,http://rs.tdwg.org/dwc/terms/eventRemarks eventTime,http://rs.tdwg.org/dwc/terms/eventTime +eventType,http://rs.tdwg.org/dwc/terms/eventType family,http://rs.tdwg.org/dwc/terms/family fieldNotes,http://rs.tdwg.org/dwc/terms/fieldNotes fieldNumber,http://rs.tdwg.org/dwc/terms/fieldNumber @@ -76,8 +52,11 @@ footprintSpatialFit,http://rs.tdwg.org/dwc/terms/footprintSpatialFit footprintSRS,http://rs.tdwg.org/dwc/terms/footprintSRS footprintWKT,http://rs.tdwg.org/dwc/terms/footprintWKT formation,http://rs.tdwg.org/dwc/terms/formation +FossilSpecimen,http://rs.tdwg.org/dwc/terms/FossilSpecimen +genericName,http://rs.tdwg.org/dwc/terms/genericName genus,http://rs.tdwg.org/dwc/terms/genus geodeticDatum,http://rs.tdwg.org/dwc/terms/geodeticDatum +GeologicalContext,http://rs.tdwg.org/dwc/terms/GeologicalContext geologicalContextID,http://rs.tdwg.org/dwc/terms/geologicalContextID georeferencedBy,http://rs.tdwg.org/dwc/terms/georeferencedBy georeferencedDate,http://rs.tdwg.org/dwc/terms/georeferencedDate @@ -90,11 +69,9 @@ habitat,http://rs.tdwg.org/dwc/terms/habitat higherClassification,http://rs.tdwg.org/dwc/terms/higherClassification higherGeography,http://rs.tdwg.org/dwc/terms/higherGeography higherGeographyID,http://rs.tdwg.org/dwc/terms/higherGeographyID -higherTaxonconceptID,http://rs.tdwg.org/dwc/terms/higherTaxonconceptID -higherTaxonName,http://rs.tdwg.org/dwc/terms/higherTaxonName 
-higherTaxonNameID,http://rs.tdwg.org/dwc/terms/higherTaxonNameID highestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/highestBiostratigraphicZone -identificationAttributes,http://rs.tdwg.org/dwc/terms/identificationAttributes +HumanObservation,http://rs.tdwg.org/dwc/terms/HumanObservation +Identification,http://rs.tdwg.org/dwc/terms/Identification identificationID,http://rs.tdwg.org/dwc/terms/identificationID identificationQualifier,http://rs.tdwg.org/dwc/terms/identificationQualifier identificationReferences,http://rs.tdwg.org/dwc/terms/identificationReferences @@ -103,8 +80,8 @@ identificationVerificationStatus,http://rs.tdwg.org/dwc/terms/identificationVeri identifiedBy,http://rs.tdwg.org/dwc/terms/identifiedBy identifiedByID,http://rs.tdwg.org/dwc/terms/identifiedByID individualCount,http://rs.tdwg.org/dwc/terms/individualCount -individualID,http://rs.tdwg.org/dwc/terms/individualID informationWithheld,http://rs.tdwg.org/dwc/terms/informationWithheld +infragenericEpithet,http://rs.tdwg.org/dwc/terms/infragenericEpithet infraspecificEpithet,http://rs.tdwg.org/dwc/terms/infraspecificEpithet institutionCode,http://rs.tdwg.org/dwc/terms/institutionCode institutionID,http://rs.tdwg.org/dwc/terms/institutionID @@ -118,53 +95,51 @@ latestEraOrHighestErathem,http://rs.tdwg.org/dwc/terms/latestEraOrHighestErathem latestPeriodOrHighestSystem,http://rs.tdwg.org/dwc/terms/latestPeriodOrHighestSystem lifeStage,http://rs.tdwg.org/dwc/terms/lifeStage lithostratigraphicTerms,http://rs.tdwg.org/dwc/terms/lithostratigraphicTerms +LivingSpecimen,http://rs.tdwg.org/dwc/terms/LivingSpecimen locality,http://rs.tdwg.org/dwc/terms/locality locationAccordingTo,http://rs.tdwg.org/dwc/terms/locationAccordingTo -locationAttributes,http://rs.tdwg.org/dwc/terms/locationAttributes locationID,http://rs.tdwg.org/dwc/terms/locationID locationRemarks,http://rs.tdwg.org/dwc/terms/locationRemarks lowestBiostratigraphicZone,http://rs.tdwg.org/dwc/terms/lowestBiostratigraphicZone 
+MachineObservation,http://rs.tdwg.org/dwc/terms/MachineObservation +MaterialCitation,http://rs.tdwg.org/dwc/terms/MaterialCitation +MaterialEntity,http://rs.tdwg.org/dwc/terms/MaterialEntity +materialEntityID,http://rs.tdwg.org/dwc/terms/materialEntityID +materialEntityRemarks,http://rs.tdwg.org/dwc/terms/materialEntityRemarks +MaterialSample,http://rs.tdwg.org/dwc/terms/MaterialSample materialSampleID,http://rs.tdwg.org/dwc/terms/materialSampleID maximumDepthInMeters,http://rs.tdwg.org/dwc/terms/maximumDepthInMeters -maximumDistanceAboveSurfaceIn,http://rs.tdwg.org/dwc/terms/maximumDistanceAboveSurfaceIn +maximumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/maximumDistanceAboveSurfaceInMeters maximumElevationInMeters,http://rs.tdwg.org/dwc/terms/maximumElevationInMeters measurementAccuracy,http://rs.tdwg.org/dwc/terms/measurementAccuracy measurementDeterminedBy,http://rs.tdwg.org/dwc/terms/measurementDeterminedBy measurementDeterminedDate,http://rs.tdwg.org/dwc/terms/measurementDeterminedDate measurementID,http://rs.tdwg.org/dwc/terms/measurementID measurementMethod,http://rs.tdwg.org/dwc/terms/measurementMethod +MeasurementOrFact,http://rs.tdwg.org/dwc/terms/MeasurementOrFact measurementRemarks,http://rs.tdwg.org/dwc/terms/measurementRemarks measurementType,http://rs.tdwg.org/dwc/terms/measurementType measurementUnit,http://rs.tdwg.org/dwc/terms/measurementUnit measurementValue,http://rs.tdwg.org/dwc/terms/measurementValue member,http://rs.tdwg.org/dwc/terms/member minimumDepthInMeters,http://rs.tdwg.org/dwc/terms/minimumDepthInMeters -minimumDistanceAboveSurfaceIn,http://rs.tdwg.org/dwc/terms/minimumDistanceAboveSurfaceIn +minimumDistanceAboveSurfaceInMeters,http://rs.tdwg.org/dwc/terms/minimumDistanceAboveSurfaceInMeters minimumElevationInMeters,http://rs.tdwg.org/dwc/terms/minimumElevationInMeters month,http://rs.tdwg.org/dwc/terms/month municipality,http://rs.tdwg.org/dwc/terms/municipality 
nameAccordingTo,http://rs.tdwg.org/dwc/terms/nameAccordingTo nameAccordingToID,http://rs.tdwg.org/dwc/terms/nameAccordingToID -namePublicationID,http://rs.tdwg.org/dwc/terms/namePublicationID namePublishedIn,http://rs.tdwg.org/dwc/terms/namePublishedIn namePublishedInID,http://rs.tdwg.org/dwc/terms/namePublishedInID namePublishedInYear,http://rs.tdwg.org/dwc/terms/namePublishedInYear nomenclaturalCode,http://rs.tdwg.org/dwc/terms/nomenclaturalCode nomenclaturalStatus,http://rs.tdwg.org/dwc/terms/nomenclaturalStatus -occurrenceAttributes,http://rs.tdwg.org/dwc/terms/occurrenceAttributes -occurrenceDetails,http://rs.tdwg.org/dwc/terms/occurrenceDetails +Occurrence,http://rs.tdwg.org/dwc/terms/Occurrence occurrenceID,http://rs.tdwg.org/dwc/terms/occurrenceID -occurrenceMeasurementAccuracy,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementAccuracy -occurrenceMeasurementDetermine,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementDetermine -occurrenceMeasurementDetermin,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementDetermin -occurrenceMeasurementID,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementID -occurrenceMeasurementRemarks,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementRemarks -occurrenceMeasurementType,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementType -occurrenceMeasurementUnit,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementUnit -occurrenceMeasurementValue,http://rs.tdwg.org/dwc/terms/occurrenceMeasurementValue occurrenceRemarks,http://rs.tdwg.org/dwc/terms/occurrenceRemarks occurrenceStatus,http://rs.tdwg.org/dwc/terms/occurrenceStatus order,http://rs.tdwg.org/dwc/terms/order +Organism,http://rs.tdwg.org/dwc/terms/Organism organismID,http://rs.tdwg.org/dwc/terms/organismID organismName,http://rs.tdwg.org/dwc/terms/organismName organismQuantity,http://rs.tdwg.org/dwc/terms/organismQuantity @@ -176,24 +151,27 @@ originalNameUsageID,http://rs.tdwg.org/dwc/terms/originalNameUsageID otherCatalogNumbers,http://rs.tdwg.org/dwc/terms/otherCatalogNumbers 
ownerInstitutionCode,http://rs.tdwg.org/dwc/terms/ownerInstitutionCode parentEventID,http://rs.tdwg.org/dwc/terms/parentEventID +parentMeasurementID,http://rs.tdwg.org/dwc/terms/parentMeasurementID parentNameUsage,http://rs.tdwg.org/dwc/terms/parentNameUsage parentNameUsageID,http://rs.tdwg.org/dwc/terms/parentNameUsageID pathway,http://rs.tdwg.org/dwc/terms/pathway phylum,http://rs.tdwg.org/dwc/terms/phylum pointRadiusSpatialFit,http://rs.tdwg.org/dwc/terms/pointRadiusSpatialFit preparations,http://rs.tdwg.org/dwc/terms/preparations +PreservedSpecimen,http://rs.tdwg.org/dwc/terms/PreservedSpecimen previousIdentifications,http://rs.tdwg.org/dwc/terms/previousIdentifications recordedBy,http://rs.tdwg.org/dwc/terms/recordedBy recordedByID,http://rs.tdwg.org/dwc/terms/recordedByID recordNumber,http://rs.tdwg.org/dwc/terms/recordNumber relatedResourceID,http://rs.tdwg.org/dwc/terms/relatedResourceID -relatedResourceType,http://rs.tdwg.org/dwc/terms/relatedResourceType relationshipAccordingTo,http://rs.tdwg.org/dwc/terms/relationshipAccordingTo relationshipEstablishedDate,http://rs.tdwg.org/dwc/terms/relationshipEstablishedDate relationshipOfResource,http://rs.tdwg.org/dwc/terms/relationshipOfResource +relationshipOfResourceID,http://rs.tdwg.org/dwc/terms/relationshipOfResourceID relationshipRemarks,http://rs.tdwg.org/dwc/terms/relationshipRemarks reproductiveCondition,http://rs.tdwg.org/dwc/terms/reproductiveCondition resourceID,http://rs.tdwg.org/dwc/terms/resourceID +ResourceRelationship,http://rs.tdwg.org/dwc/terms/ResourceRelationship resourceRelationshipID,http://rs.tdwg.org/dwc/terms/resourceRelationshipID sampleSizeUnit,http://rs.tdwg.org/dwc/terms/sampleSizeUnit sampleSizeValue,http://rs.tdwg.org/dwc/terms/sampleSizeValue @@ -202,32 +180,36 @@ samplingProtocol,http://rs.tdwg.org/dwc/terms/samplingProtocol scientificName,http://rs.tdwg.org/dwc/terms/scientificName scientificNameAuthorship,http://rs.tdwg.org/dwc/terms/scientificNameAuthorship 
scientificNameID,http://rs.tdwg.org/dwc/terms/scientificNameID -scientificNameRank,http://rs.tdwg.org/dwc/terms/scientificNameRank sex,http://rs.tdwg.org/dwc/terms/sex specificEpithet,http://rs.tdwg.org/dwc/terms/specificEpithet startDayOfYear,http://rs.tdwg.org/dwc/terms/startDayOfYear stateProvince,http://rs.tdwg.org/dwc/terms/stateProvince +subfamily,http://rs.tdwg.org/dwc/terms/subfamily subgenus,http://rs.tdwg.org/dwc/terms/subgenus -taxonAccordingTo,http://rs.tdwg.org/dwc/terms/taxonAccordingTo -taxonAttributes,http://rs.tdwg.org/dwc/terms/taxonAttributes +subtribe,http://rs.tdwg.org/dwc/terms/subtribe +superfamily,http://rs.tdwg.org/dwc/terms/superfamily +Taxon,http://rs.tdwg.org/dwc/terms/Taxon taxonConceptID,http://rs.tdwg.org/dwc/terms/taxonConceptID taxonID,http://rs.tdwg.org/dwc/terms/taxonID -taxonNameID,http://rs.tdwg.org/dwc/terms/taxonNameID taxonomicStatus,http://rs.tdwg.org/dwc/terms/taxonomicStatus taxonRank,http://rs.tdwg.org/dwc/terms/taxonRank taxonRemarks,http://rs.tdwg.org/dwc/terms/taxonRemarks +tribe,http://rs.tdwg.org/dwc/terms/tribe typeStatus,http://rs.tdwg.org/dwc/terms/typeStatus verbatimCoordinates,http://rs.tdwg.org/dwc/terms/verbatimCoordinates verbatimCoordinateSystem,http://rs.tdwg.org/dwc/terms/verbatimCoordinateSystem verbatimDepth,http://rs.tdwg.org/dwc/terms/verbatimDepth verbatimElevation,http://rs.tdwg.org/dwc/terms/verbatimElevation verbatimEventDate,http://rs.tdwg.org/dwc/terms/verbatimEventDate +verbatimIdentification,http://rs.tdwg.org/dwc/terms/verbatimIdentification +verbatimLabel,http://rs.tdwg.org/dwc/terms/verbatimLabel verbatimLatitude,http://rs.tdwg.org/dwc/terms/verbatimLatitude verbatimLocality,http://rs.tdwg.org/dwc/terms/verbatimLocality verbatimLongitude,http://rs.tdwg.org/dwc/terms/verbatimLongitude -verbatimScientificNameRank,http://rs.tdwg.org/dwc/terms/verbatimScientificNameRank verbatimSRS,http://rs.tdwg.org/dwc/terms/verbatimSRS verbatimTaxonRank,http://rs.tdwg.org/dwc/terms/verbatimTaxonRank 
vernacularName,http://rs.tdwg.org/dwc/terms/vernacularName +verticalDatum,http://rs.tdwg.org/dwc/terms/verticalDatum +vitality,http://rs.tdwg.org/dwc/terms/vitality waterBody,http://rs.tdwg.org/dwc/terms/waterBody -year,http://rs.tdwg.org/dwc/terms/year \ No newline at end of file +year,http://rs.tdwg.org/dwc/terms/year diff --git a/src/dwcahandler/dwca/terms/dublin-core-terms.csv b/src/dwcahandler/dwca/terms/dublin-core-terms.csv new file mode 100644 index 0000000..fa9100a --- /dev/null +++ b/src/dwcahandler/dwca/terms/dublin-core-terms.csv @@ -0,0 +1,56 @@ +term,uri +abstract,http://purl.org/dc/terms/abstract +accessRights,http://purl.org/dc/terms/accessRights +accrualMethod,http://purl.org/dc/terms/accrualMethod +accrualPeriodicity,http://purl.org/dc/terms/accrualPeriodicity +accrualPolicy,http://purl.org/dc/terms/accrualPolicy +alternative,http://purl.org/dc/terms/alternative +audience,http://purl.org/dc/terms/audience +available,http://purl.org/dc/terms/available +bibliographicCitation,http://purl.org/dc/terms/bibliographicCitation +conformsTo,http://purl.org/dc/terms/conformsTo +contributor,http://purl.org/dc/terms/contributor +coverage,http://purl.org/dc/terms/coverage +created,http://purl.org/dc/terms/created +creator,http://purl.org/dc/terms/creator +date,http://purl.org/dc/terms/date +dateAccepted,http://purl.org/dc/terms/dateAccepted +dateCopyrighted,http://purl.org/dc/terms/dateCopyrighted +dateSubmitted,http://purl.org/dc/terms/dateSubmitted +description,http://purl.org/dc/terms/description +educationLevel,http://purl.org/dc/terms/educationLevel +extent,http://purl.org/dc/terms/extent +format,http://purl.org/dc/terms/format +hasFormat,http://purl.org/dc/terms/hasFormat +hasPart,http://purl.org/dc/terms/hasPart +hasVersion,http://purl.org/dc/terms/hasVersion +identifier,http://purl.org/dc/terms/identifier +instructionalMethod,http://purl.org/dc/terms/instructionalMethod +isFormatOf,http://purl.org/dc/terms/isFormatOf 
+isPartOf,http://purl.org/dc/terms/isPartOf +isReferencedBy,http://purl.org/dc/terms/isReferencedBy +isReplacedBy,http://purl.org/dc/terms/isReplacedBy +isRequiredBy,http://purl.org/dc/terms/isRequiredBy +issued,http://purl.org/dc/terms/issued +isVersionOf,http://purl.org/dc/terms/isVersionOf +language,http://purl.org/dc/terms/language +license,http://purl.org/dc/terms/license +mediator,http://purl.org/dc/terms/mediator +medium,http://purl.org/dc/terms/medium +modified,http://purl.org/dc/terms/modified +provenance,http://purl.org/dc/terms/provenance +publisher,http://purl.org/dc/terms/publisher +references,http://purl.org/dc/terms/references +relation,http://purl.org/dc/terms/relation +replaces,http://purl.org/dc/terms/replaces +requires,http://purl.org/dc/terms/requires +rights,http://purl.org/dc/terms/rights +rightsHolder,http://purl.org/dc/terms/rightsHolder +source,http://purl.org/dc/terms/source +spatial,http://purl.org/dc/terms/spatial +subject,http://purl.org/dc/terms/subject +tableOfContents,http://purl.org/dc/terms/tableOfContents +temporal,http://purl.org/dc/terms/temporal +title,http://purl.org/dc/terms/title +type,http://purl.org/dc/terms/type +valid,http://purl.org/dc/terms/valid \ No newline at end of file diff --git a/src/dwcahandler/scripts/update_dwc_terms.py b/src/dwcahandler/scripts/update_dwc_terms.py new file mode 100644 index 0000000..2f21b7c --- /dev/null +++ b/src/dwcahandler/scripts/update_dwc_terms.py @@ -0,0 +1,22 @@ +""" +A script called from cli for eg: + poetry run update-dwc-terms + +or github actions before doing the build +WIP: Need to automatically pull vocabulary version date from tdwg github + and find a way to update Readme if possible + Do we need to implement pulling a specific version of vocab?? 
Still need to decide +""" + +from dwcahandler.dwca.terms import Terms + +# Need to populate the Dwc term version programmatically +DWC_TERM_VERSION = "2023-09-17" + + +def update_terms(): + """ + Call the update_dwc_terms to get the latest version of tdwg dwc terms + Do we need to get a particular version of csv url to pass in?? + """ + Terms.update_dwc_terms() diff --git a/tests/test_listterms.py b/tests/test_listterms.py new file mode 100644 index 0000000..999a192 --- /dev/null +++ b/tests/test_listterms.py @@ -0,0 +1,42 @@ +import pandas as pd +from numpy import nan +from dwcahandler.dwca import DwcaHandler, Terms + +class TestTerms(): + """ + Test for terms + """ + + def test_list_dwc_terms(self): + """ + Test that mandatory terms are present + """ + df = DwcaHandler.list_dwc_terms() + assert df.query('term == "occurrenceID"').shape[0] == 1 + assert df.query('term == "basisOfRecord"').shape[0] == 1 + assert df.query('term == "scientificName"').shape[0] == 1 + assert df.query('term == "decimalLatitude"').shape[0] == 1 + assert df.query('term == "decimalLongitude"').shape[0] == 1 + assert df.query('term == "eventDate"').shape[0] == 1 + + def test_update_list_terms(self, mocker): + """ + Test that the terms are stored correctly and deprecated terms are not brought over + """ + mocker.patch('pandas.read_csv', + return_value=pd.DataFrame( + {"term_localName": ["occurrenceID", "basisOfRecord", + "scientificName", "oldTerm"], + "term_isDefinedBy": ["http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/", + "http://rs.tdwg.org/dwc/terms/"], + "term_deprecated": [nan, nan, nan, "true"]})) + mocker.patch('pandas.DataFrame.to_csv') + return_dwc_df = Terms.update_dwc_terms() + pd.testing.assert_frame_equal(return_dwc_df, + pd.DataFrame({"term": ["occurrenceID", "basisOfRecord", + "scientificName"], + "uri": ["http://rs.tdwg.org/dwc/terms/occurrenceID", + "http://rs.tdwg.org/dwc/terms/basisOfRecord", +
"http://rs.tdwg.org/dwc/terms/scientificName"]}))