diff --git a/.gitignore b/.gitignore index 8c5f805..923fb3b 100644 --- a/.gitignore +++ b/.gitignore @@ -119,4 +119,6 @@ dmypy.json .vscode # Env configs -*.env \ No newline at end of file +*.env + +.DS_Store diff --git a/Pipfile b/Pipfile index c58d20c..ba09f38 100644 --- a/Pipfile +++ b/Pipfile @@ -13,7 +13,9 @@ python-dotenv = "*" [packages] flask = "*" +ftfy = "*" pyoai = "*" +psycopg2-binary = "*" requests = "*" python-dateutil = "*" lxml = "*" @@ -23,4 +25,4 @@ sentry-sdk = {extras = ["flask"],version = "*"} python-dotenv = "*" [requires] -python_version = "3.6" +python_version = "3.5" diff --git a/Pipfile.lock b/Pipfile.lock index 032c439..3a08a9b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "408ef5f3e6921b3359ecee3e8d293c34bd619136293244fe829928ab30c6cf06" + "sha256": "875a3074f25d395b53d43cb957d5d4567234ba291132d91c88cbc1a2f20c77b2" }, "pipfile-spec": 6, "requires": { - "python_version": "3.6" + "python_version": "3.5" }, "sources": [ { @@ -24,10 +24,10 @@ }, "certifi": { "hashes": [ - "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", - "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", + "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" ], - "version": "==2019.9.11" + "version": "==2019.11.28" }, "chardet": { "hashes": [ @@ -53,10 +53,10 @@ }, "faker": { "hashes": [ - "sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "flask": { "hashes": [ @@ -66,6 +66,12 @@ "index": "pypi", "version": "==1.1.1" }, + "ftfy": { + "hashes": [ + 
"sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720" + ], + "version": "==5.8" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", @@ -82,50 +88,50 @@ }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "version": "==2.10.3" + "version": "==2.11.1" }, "json-log-formatter": { "hashes": [ - "sha256:8bb02773a37274c08f4de748e3accbed269c8395d27149bc2e7e9109af342eee", - "sha256:cdc1f1dabc0b9c808ed4e4f26e73885a9e7955bf7190dd9f1b86be967feb5b29" + "sha256:ee187c9a80936cbf1259f73573973450fc24b84a4fb54e53eb0dcff86ea1e759" ], "index": "pypi", - "version": "==0.2.0" + "version": "==0.3.0" }, "lxml": { "hashes": [ - "sha256:02ca7bf899da57084041bb0f6095333e4d239948ad3169443f454add9f4e9cb4", - "sha256:096b82c5e0ea27ce9138bcbb205313343ee66a6e132f25c5ed67e2c8d960a1bc", - "sha256:0a920ff98cf1aac310470c644bc23b326402d3ef667ddafecb024e1713d485f1", - "sha256:1409b14bf83a7d729f92e2a7fbfe7ec929d4883ca071b06e95c539ceedb6497c", - "sha256:17cae1730a782858a6e2758fd20dd0ef7567916c47757b694a06ffafdec20046", - "sha256:17e3950add54c882e032527795c625929613adbd2ce5162b94667334458b5a36", - "sha256:1f4f214337f6ee5825bf90a65d04d70aab05526c08191ab888cb5149501923c5", - "sha256:2e8f77db25b0a96af679e64ff9bf9dddb27d379c9900c3272f3041c4d1327c9d", - "sha256:4dffd405390a45ecb95ab5ab1c1b847553c18b0ef8ed01e10c1c8b1a76452916", - "sha256:6b899931a5648862c7b88c795eddff7588fb585e81cecce20f8d9da16eff96e0", - "sha256:726c17f3e0d7a7200718c9a890ccfeab391c9133e363a577a44717c85c71db27", - "sha256:760c12276fee05c36f95f8040180abc7fbebb9e5011447a97cdc289b5d6ab6fc", - "sha256:796685d3969815a633827c818863ee199440696b0961e200b011d79b9394bbe7", - 
"sha256:891fe897b49abb7db470c55664b198b1095e4943b9f82b7dcab317a19116cd38", - "sha256:9277562f175d2334744ad297568677056861070399cec56ff06abbe2564d1232", - "sha256:a471628e20f03dcdfde00770eeaf9c77811f0c331c8805219ca7b87ac17576c5", - "sha256:a63b4fd3e2cabdcc9d918ed280bdde3e8e9641e04f3c59a2a3109644a07b9832", - "sha256:ae88588d687bd476be588010cbbe551e9c2872b816f2da8f01f6f1fda74e1ef0", - "sha256:b0b84408d4eabc6de9dd1e1e0bc63e7731e890c0b378a62443e5741cfd0ae90a", - "sha256:be78485e5d5f3684e875dab60f40cddace2f5b2a8f7fede412358ab3214c3a6f", - "sha256:c27eaed872185f047bb7f7da2d21a7d8913457678c9a100a50db6da890bc28b9", - "sha256:c7fccd08b14aa437fe096c71c645c0f9be0655a9b1a4b7cffc77bcb23b3d61d2", - "sha256:c81cb40bff373ab7a7446d6bbca0190bccc5be3448b47b51d729e37799bb5692", - "sha256:d11874b3c33ee441059464711cd365b89fa1a9cf19ae75b0c189b01fbf735b84", - "sha256:e9c028b5897901361d81a4718d1db217b716424a0283afe9d6735fe0caf70f79", - "sha256:fe489d486cd00b739be826e8c1be188ddb74c7a1ca784d93d06fda882a6a1681" + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", 
+ "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" ], "index": "pypi", - "version": "==4.4.1" + "version": "==4.5.0" }, "markupsafe": { "hashes": [ @@ -133,13 +139,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + "sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + 
"sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -156,10 +165,50 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], "version": "==1.1.1" }, + "psycopg2-binary": { + "hashes": [ + "sha256:040234f8a4a8dfd692662a8308d78f63f31a97e1c42d2480e5e6810c48966a29", + "sha256:086f7e89ec85a6704db51f68f0dcae432eff9300809723a6e8782c41c2f48e03", + "sha256:18ca813fdb17bc1db73fe61b196b05dd1ca2165b884dd5ec5568877cabf9b039", + "sha256:19dc39616850342a2a6db70559af55b22955f86667b5f652f40c0e99253d9881", + "sha256:2166e770cb98f02ed5ee2b0b569d40db26788e0bf2ec3ae1a0d864ea6f1d8309", + "sha256:3a2522b1d9178575acee4adf8fd9f979f9c0449b00b4164bb63c3475ea6528ed", + "sha256:3aa773580f85a28ffdf6f862e59cb5a3cc7ef6885121f2de3fca8d6ada4dbf3b", + "sha256:3b5deaa3ee7180585a296af33e14c9b18c218d148e735c7accf78130765a47e3", + "sha256:407af6d7e46593415f216c7f56ba087a9a42bd6dc2ecb86028760aa45b802bd7", + "sha256:4c3c09fb674401f630626310bcaf6cd6285daf0d5e4c26d6e55ca26a2734e39b", + "sha256:4c6717962247445b4f9e21c962ea61d2e884fc17df5ddf5e35863b016f8a1f03", + "sha256:50446fae5681fc99f87e505d4e77c9407e683ab60c555ec302f9ac9bffa61103", + 
"sha256:5057669b6a66aa9ca118a2a860159f0ee3acf837eda937bdd2a64f3431361a2d", + "sha256:5dd90c5438b4f935c9d01fcbad3620253da89d19c1f5fca9158646407ed7df35", + "sha256:659c815b5b8e2a55193ede2795c1e2349b8011497310bb936da7d4745652823b", + "sha256:69b13fdf12878b10dc6003acc8d0abf3ad93e79813fd5f3812497c1c9fb9be49", + "sha256:7a1cb80e35e1ccea3e11a48afe65d38744a0e0bde88795cc56a4d05b6e4f9d70", + "sha256:7e6e3c52e6732c219c07bd97fff6c088f8df4dae3b79752ee3a817e6f32e177e", + "sha256:7f42a8490c4fe854325504ce7a6e4796b207960dabb2cbafe3c3959cb00d1d7e", + "sha256:84156313f258eafff716b2961644a4483a9be44a5d43551d554844d15d4d224e", + "sha256:8578d6b8192e4c805e85f187bc530d0f52ba86c39172e61cd51f68fddd648103", + "sha256:890167d5091279a27e2505ff0e1fb273f8c48c41d35c5b92adbf4af80e6b2ed6", + "sha256:98e10634792ac0e9e7a92a76b4991b44c2325d3e7798270a808407355e7bb0a1", + "sha256:9aadff9032e967865f9778485571e93908d27dab21d0fdfdec0ca779bb6f8ad9", + "sha256:9f24f383a298a0c0f9b3113b982e21751a8ecde6615494a3f1470eb4a9d70e9e", + "sha256:a73021b44813b5c84eda4a3af5826dd72356a900bac9bd9dd1f0f81ee1c22c2f", + "sha256:afd96845e12638d2c44d213d4810a08f4dc4a563f9a98204b7428e567014b1cd", + "sha256:b73ddf033d8cd4cc9dfed6324b1ad2a89ba52c410ef6877998422fcb9c23e3a8", + "sha256:b8f490f5fad1767a1331df1259763b3bad7d7af12a75b950c2843ba319b2415f", + "sha256:dbc5cd56fff1a6152ca59445178652756f4e509f672e49ccdf3d79c1043113a4", + "sha256:eac8a3499754790187bb00574ab980df13e754777d346f85e0ff6df929bcd964", + "sha256:eaed1c65f461a959284649e37b5051224f4db6ebdc84e40b5e65f2986f101a08" + ], + "index": "pypi", + "version": "==2.8.4" + }, "pyoai": { "hashes": [ "sha256:029521e1f6a819511feb4299a6181b5c312e8a71f7cddc4547e27001e7552be0" @@ -177,11 +226,11 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + 
"sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "requests": { "hashes": [ @@ -196,18 +245,18 @@ "flask" ], "hashes": [ - "sha256:09e1e8f00f22ea580348f83bbbd880adf40b29f1dec494a8e4b33e22f77184fb", - "sha256:ff1fa7fb85703ae9414c8b427ee73f8363232767c9cd19158f08f6e4f0b58fc7" + "sha256:b06dd27391fd11fb32f84fe054e6a64736c469514a718a99fb5ce1dff95d6b28", + "sha256:e023da07cfbead3868e1e2ba994160517885a32dfd994fc455b118e37989479b" ], "index": "pypi", - "version": "==0.13.2" + "version": "==0.14.1" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "text-unidecode": { "hashes": [ @@ -218,17 +267,24 @@ }, "urllib3": { "hashes": [ - "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", - "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], - "version": "==1.25.6" + "version": "==1.25.8" }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" + ], + "version": "==1.0.0" + }, + "wcwidth": { + "hashes": [ + "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83", + "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784" ], - "version": "==0.16.0" + 
"version": "==0.2.5" } }, "develop": { @@ -244,6 +300,7 @@ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" ], + "markers": "sys_platform == 'win32'", "version": "==1.3.0" }, "attrs": { @@ -253,6 +310,14 @@ ], "version": "==19.3.0" }, + "colorama": { + "hashes": [ + "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff", + "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.4.3" + }, "factory-boy": { "hashes": [ "sha256:728df59b372c9588b83153facf26d3d28947fc750e8e3c95cefa9bed0e6394ee", @@ -263,18 +328,18 @@ }, "faker": { "hashes": [ - "sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "importlib-metadata": { "hashes": [ - "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", - "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==0.23" + "version": "==1.5.0" }, "isort": { "hashes": [ @@ -318,62 +383,70 @@ }, "more-itertools": { "hashes": [ - "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", - "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "version": "==7.2.0" + "version": "==8.2.0" }, "packaging": { "hashes": [ - 
"sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", - "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" + "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73", + "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334" + ], + "version": "==20.1" + }, + "pathlib2": { + "hashes": [ + "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db", + "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868" ], - "version": "==19.2" + "markers": "python_version < '3.6'", + "version": "==2.3.5" }, "pluggy": { "hashes": [ - "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", - "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "version": "==0.13.0" + "version": "==0.13.1" }, "py": { "hashes": [ - "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", - "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", + "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "version": "==1.8.0" + "version": "==1.8.1" }, "pylint": { "hashes": [ - "sha256:7b76045426c650d2b0f02fc47c14d7934d17898779da95288a74c2a7ec440702", - "sha256:856476331f3e26598017290fd65bebe81c960e806776f324093a46b76fb2d1c0" + "sha256:3db5468ad013380e987410a8d6956226963aed94ecb5f9d3a28acca6d9ac36cd", + "sha256:886e6afc935ea2590b462664b161ca9a5e40168ea99e5300935f6591ad467df4" ], "index": "pypi", - "version": "==2.4.3" + "version": "==2.4.4" }, "pyparsing": { "hashes": [ - "sha256:4acadc9a2b96c19fe00932a38ca63e601180c39a189a696abce1eaab641447e1", - "sha256:61b5ed888beab19ddccab3478910e2076a6b5a0295dffc43021890e136edf764" + 
"sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", + "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "version": "==2.4.4" + "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6", - "sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "pypi", - "version": "==5.2.2" + "version": "==5.3.5" }, "pytest-mock": { "hashes": [ - "sha256:b3514caac35fe3f05555923eabd9546abce11571cc2ddf7d8615959d04f2c89e", - "sha256:ea502c3891599c26243a3a847ccf0b1d20556678c528f86c98e3cd6d40c5cf11" + "sha256:b35eb281e93aafed138db25c8772b95d3756108b601947f89af503f8c629413f", + "sha256:cb67402d87d5f53c579263d37971a164743dc33c159dfb4fb4a86f37c5552307" ], "index": "pypi", - "version": "==1.11.2" + "version": "==2.0.0" }, "python-dateutil": { "hashes": [ @@ -385,18 +458,18 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "termcolor": { "hashes": [ @@ -414,36 +487,37 @@ }, "typed-ast": { "hashes": [ - 
"sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", 
+ "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "wcwidth": { "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" ], - "version": "==0.1.7" + "version": "==0.1.8" }, "wrapt": { "hashes": [ @@ -453,10 +527,10 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:15428d652e993b6ce86694c3cccf0d71aa7afdc6ef1807fa25a920e9444e0281", + 
"sha256:d9d2efe11d3a3fb9184da550d35bd1319dc8e30a63255927c82bb42fca1f4f7c" ], - "version": "==0.6.0" + "version": "==1.1.0" } } } diff --git a/vendor/docker/env.conf b/vendor/docker/env.conf index cb67f6e..c4f48e5 100644 --- a/vendor/docker/env.conf +++ b/vendor/docker/env.conf @@ -3,7 +3,13 @@ env OAIPMH_BASE_URL; env DATACITE_API_URL; env OAIPMH_REPOS_NAME; env OAIPMH_ADMIN_EMAIL; +env OAIPMH_IDENTIFIER; env SENTRY_DSN; env DATACITE_API_ADMIN_USERNAME; env DATACITE_API_ADMIN_PASSWORD; env RESULT_SET_SIZE; +env CATALOG_SET; +env POSTGRES_SERVER; +env POSTGRES_DB; +env POSTGRES_USER; +env POSTGRES_PASSWORD; diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 6ea74eb..78729aa 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -12,6 +12,7 @@ from viringo import config from .services import datacite +from .services import frdr class DataCiteOAIServer(): @@ -285,6 +286,267 @@ def build_metadata_map(self, result): return metadata +class FRDROAIServer(): + """Build OAI-PMH responses from the FRDR Postgres server""" + def identify(self): + """Construct common identification for the OAI service""" + + identify = common.Identify( + repositoryName=config.OAIPMH_REPOS_NAME, + baseURL=config.OAIPMH_BASE_URL, + protocolVersion="2.0", + adminEmails=[config.OAIPMH_ADMIN_EMAIL], + earliestDatestamp=datetime(2011, 1, 1), + deletedRecord='no', + granularity='YYYY-MM-DDThh:mm:ssZ', + compression=['gzip', 'deflate'], + toolkit_description=False) + + # Specify a custom description + frdr_desc = """ + + oai + """ + config.OAIPMH_IDENTIFIER + """ + : + oai""" + config.OAIPMH_IDENTIFIER + """:1 + + """ + + identify.add_description(xml_string=frdr_desc) + + return identify + + def listMetadataFormats(self, identifier=None): + #pylint: disable=no-self-use,invalid-name + """Returns metadata formats available for the repository + + Identifier does nothing as our repository responds in all formats for all dois + """ + # PyOAI Expects result format (metadataPrefix, schema, 
metadataNamespace) + + format_oai_dc = ( + 'oai_dc', + 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', + 'http://www.openarchives.org/OAI/2.0/oai_dc/' + ) + + format_oai_datacite = ( + 'oai_datacite', + 'http://schema.datacite.org/oai/oai-1.1/oai.xsd', + 'http://schema.datacite.org/oai/oai-1.1/' + ) + + format_datacite = ( + 'datacite', + 'http://schema.datacite.org/meta/nonexistant/nonexistant.xsd', + 'http://datacite.org/schema/nonexistant' + ) + + return [format_oai_dc, format_oai_datacite, format_datacite] + + def getRecord(self, metadataPrefix, identifier): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for specific record""" + + # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? + + result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) + if not result: + raise error.IdDoesNotExistError( + "\"%s\" is unknown or illegal in this repository" % identifier + ) + + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + data = ( + header, + record, + None # About string - not used + ) + + return data + + def listRecords( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of records""" + + # If available get the search query from the set param + search_query = set_to_search_query(set) + + results, total_records, paging_cursor = frdr.get_metadata_list( + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, + query=search_query, + set=set, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + 
) + + if paging_cursor >= total_records: + paging_cursor = None + + records = [] + if results: + for result in results: + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + + data = ( + header, + record, + None # About string - not used + ) + + records.append(data) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listIdentifiers( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of identifiers""" + + # If available get the search query from the set param + search_query = set_to_search_query(set) + + results, total_records, paging_cursor = frdr.get_metadata_list( + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, + query=search_query, + set=set, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + ) + + if paging_cursor >= total_records: + paging_cursor = None + + records = [] + if results: + for result in results: + header = self.build_header(result) + + records.append(header) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listSets( + self, + paging_cursor=0 + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of sets""" + + # Note this implementation is not super efficient as we request + # the full set everytime regardles of actual paging + # The paging is handled just by offsetting the records returned. 
+ # This is however acceptable given sets are a small subset of data. + + # We know we're always dealing with a integer value here + paging_cursor = int(paging_cursor) + + batch_size = 50 + next_batch = paging_cursor + batch_size + results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) + results = results[paging_cursor: next_batch] + + if len(results) < batch_size: + paging_cursor = None + else: + paging_cursor = next_batch + + records = [] + if results: + for identifier, name in results: + # Format of a set is setSpec, setName, setDescription + records.append((identifier, name, None)) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_results, paging_cursor + + def build_header(self, result): + """Construct a OAI-PMH record header""" + + return common.Header( + None, + str(result.identifier), + result.updated_datetime, + setspec=[result.client], + deleted=not result.active + ) + + def build_record(self, metadata): + """Construct a OAI-PMH payload for a record""" + + return common.Metadata( + None, + metadata + ) + + def build_metadata_map(self, result): + """Construct a metadata map object for oai metadata writing""" + identifiers = result.identifiers + + relations = [ + identifier_to_string(relation) + for relation in result.relations + ] + + metadata = { + 'title': result.titles, + 'creator': result.creators, + 'subject': result.subjects, + 'description': result.descriptions, + 'publisher': [result.publisher] if result.publisher else [], + 'contributor': result.contributors, + 'date': result.dates, + 'type': result.resource_types, + 'format': result.formats, + 'identifier': identifiers, + 'relation': relations, + 'language': [result.language] if result.language else [], + 'rights': result.rights, + 'xml': 
result.xml, + 'set': result.client, + 'metadata_version': result.metadata_version + } + + return metadata + + def set_to_search_query(unparsed_set): """Take a oai set and extract any base64url encoded search query""" @@ -298,7 +560,6 @@ def set_to_search_query(unparsed_set): return "" - def set_to_provider_client(unparsed_set): """Take a oai set and convert into provider_id and client_id""" diff --git a/viringo/config.py b/viringo/config.py index 49ad666..ef397d6 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -16,5 +16,19 @@ OAIPMH_BASE_URL = os.getenv('OAIPMH_BASE_URL', 'https://oai.datacite.org/oai') # Admin e-mail for the OAI-PMH service OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org') +# OAI repository identifier +OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) +# Source metadata catalog (DataCite or FRDR) +CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'DataCite') +# FRDR Postgres server +POSTGRES_SERVER = os.getenv('OAIPMH_POSTGRES_SERVER', '') +# FRDR Postgres db +POSTGRES_DB = os.getenv('OAIPMH_POSTGRES_DB', '') +# FRDR Postgres user +POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') +# FRDR Postgres password +POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') +# FRDR Postgres port +POSTGRES_PORT = os.getenv('OAIPMH_POSTGRES_PORT', '5432') \ No newline at end of file diff --git a/viringo/metadata.py b/viringo/metadata.py index 4ea2ee8..5c2f6d0 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -1,6 +1,7 @@ """This module deals with handling the representation of metadata formats for OAI""" import re +import ftfy from lxml import etree NS_OAIPMH = 'http://www.openarchives.org/OAI/2.0/' @@ -43,10 +44,20 @@ def nsdc(name): ]: for value in _map.get(name, []): if value: + if isinstance(value, list): + if len(value) == 1: + value = value[0] + else: + value = str(value) 
new_element = etree.SubElement(e_dc, nsdc(name)) - # The regular expression here is to filter only valid XML chars - # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value) + if isinstance(value, str): + try: + value = value.replace('\x0c', " ") + new_element.text = ftfy.fix_text(value) + except: + new_element.text = '' + else: + new_element.text = '' def datacite_writer(element: etree.Element, metadata): """Writer for writing data in a metadata object out into raw datacite format""" @@ -62,7 +73,10 @@ def oai_datacite_writer(element: etree.Element, metadata): _map = metadata.getMap() raw_xml = _map.get('xml', '') - xml_resource_element = etree.fromstring(raw_xml) + try: + xml_resource_element = etree.fromstring(raw_xml) + except: + print(raw_xml) e_oai_datacite = etree.SubElement( element, "oai_datacite", {'xmlns': 'http://schema.datacite.org/oai/oai-1.1/'}, diff --git a/viringo/oai.py b/viringo/oai.py index 41f3377..5e4fbc7 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -12,7 +12,11 @@ import oaipmh.datestamp from .catalogs import DataCiteOAIServer +from .catalogs import FRDROAIServer from . import metadata +from . 
import config + +import sys BP = Blueprint('oai', __name__) @@ -93,7 +97,10 @@ def handleVerb(self, verb, kw): def get_oai_server(): """Returns a pyoai server object that can process and return OAI requests""" if 'oai' not in g: - catalog_server = DataCiteOAIServer() + if config.CATALOG_SET == 'FRDR': + catalog_server = FRDROAIServer() + else: + catalog_server = DataCiteOAIServer() metadata_registry = oaipmh.metadata.MetadataRegistry() metadata_registry.registerWriter('oai_dc', metadata.oai_dc_writer) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py new file mode 100644 index 0000000..eea4346 --- /dev/null +++ b/viringo/services/frdr.py @@ -0,0 +1,525 @@ +"""Handles DB queries for retrieving metadata""" + +import psycopg2 +import re +from datetime import datetime +import dateutil.parser +import dateutil.tz +from viringo import config +import xml.etree.cElementTree as ET +import ftfy + +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + 
self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + +def xml_fix_text(text): + if isinstance(text, str) and len(text) > 0: + text = text.replace('\x0c', " ") + return ftfy.fix_text(text) + else: + return '' + +def construct_datacite_xml(data): + resource = ET.Element("resource") + resource.set("xmlns", "http://datacite.org/schema/kernel-4") + resource.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") + resource.set("xsi:schemaLocation", + "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd") + + # Add resource URL as identifier + identifier = ET.SubElement(resource, "identifier") + if "doi.org/" in data['item_url']: + identifier.set("identifierType", "DOI") + identifier.text = xml_fix_text(data['item_url'].split("doi.org/")[1]) + else: + identifier.set("identifierType", "URL") + identifier.text = xml_fix_text(data['item_url']) + + + # Add creators + creators = ET.SubElement(resource, "creators") + for creator_entry in data['dc:contributor.author']: + creator = ET.SubElement(creators, "creator") + creatorName = ET.SubElement(creator, "creatorName") + creatorName.text = xml_fix_text(creator_entry) + + # Add titles + titles = ET.SubElement(resource, "titles") + if data['title_en'] != "": + title = ET.SubElement(titles, "title") + title.text = xml_fix_text(data['title_en']) + title.set("xml:lang", "en") + if data['title_fr'] != "": + title = ET.SubElement(titles, "title") + title.text = xml_fix_text(data['title_fr']) + title.set("xml:lang", "fr") + if data['title_en'] != "": + title.set("titleType", "TranslatedTitle") + + # Add publisher + publisher = ET.SubElement(resource, "publisher") + publisher.text = xml_fix_text(data['repository_name']) + 
+ # Add publication year + publicationyear = ET.SubElement(resource, "publicationYear") + publicationyear.text = xml_fix_text(data['pub_date'][:4]) + + # Add subjects + subject_and_tags = [] + subjects = ET.SubElement(resource, "subjects") + for subject_entry in data['frdr:category_en']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = xml_fix_text(subject_entry) + for subject_entry in data['frdr:category_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") + subject.text = xml_fix_text(subject_entry) + for subject_entry in data['frdr:keywords_en']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = xml_fix_text(subject_entry) + for subject_entry in data['frdr:keywords_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") + subject.text = xml_fix_text(subject_entry) + + # If subjects is empty, remove it + if len(subjects) == 0: + resource.remove(subjects) + + # Add contributors (contributorType "Other") + contributors = ET.SubElement(resource, "contributors") + for contributor_entry in data["dc:contributor"]: + contributor = ET.SubElement(contributors, "contributor") + contributor.set("contributorType", "Other") + contributorName = ET.SubElement(contributor, "contributorName") + contributorName.text = xml_fix_text(contributor_entry) + + # Add FRDR as HostingInstituton + contributor_en = ET.SubElement(contributors, "contributor") + contributor_en.set("contributorType", 
"HostingInstitution") + contributor_en.set("xml:lang", "en") + contributorName_en = ET.SubElement(contributor_en, "contributorName") + contributorName_en.text = "Federated Research Data Repository" + contributor_fr = ET.SubElement(contributors, "contributor") + contributor_fr.set("contributorType", "HostingInstitution") + contributor_fr.set("xml:lang", "fr") + contributorName_fr = ET.SubElement(contributor_fr, "contributorName") + contributorName_fr.text = "Dépôt fédéré de données de recherche" + + # Add dates + dates = ET.SubElement(resource, "dates") + date = ET.SubElement(dates, "date") + date.set("dateType", "Issued") + date.text = xml_fix_text(data['pub_date']) + + # Add resourceType + resourceType = ET.SubElement(resource, "resourceType") + resourceType.set("resourceTypeGeneral", "Dataset") + resourceType.text = "Dataset" + + # Add alternateIdentifiers + alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") + alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") + alternateIdentifier.set("alternateIdentifierType", "local") + alternateIdentifier.text = xml_fix_text(data['local_identifier']) + + # Add rightsList + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights']: + if rights_entry != '': + rights = ET.SubElement(rightsList, "rights") + rights.text = xml_fix_text(rights_entry) + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = xml_fix_text(rights_entry[:rights_entry.find("http")].strip()) + # Add access statement + rights = ET.SubElement(rightsList, "rights") + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + # If Public in frdr:access, use openAccess + if access_entry == "Public": + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") + break + if "rightsURI" not in rights.attrib: + # If there are access values and none are Public, use restrictedAccess + 
rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") + else: + # If not indicated, assume Public/openAccess + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") + + # Add description(s) + descriptions = ET.SubElement(resource, "descriptions") + for description_entry in data['dc:description_en']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.set("xml:lang", "en") + description.text = xml_fix_text(description_entry) + for description_entry in data['dc:description_fr']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.set("xml:lang", "fr") + description.text = xml_fix_text(description_entry) + + # Add series (series) + if data['series'] != "": + description_series = ET.SubElement(descriptions, "description") + description_series.set("descriptionType", "SeriesInformation") + description_series.text = xml_fix_text(data['series']) + + # If descriptions is empty, remove it + if len(descriptions) == 0: + resource.remove(descriptions) + + # Add GeoLocation + geolocations = ET.SubElement(resource, "geoLocations") + if "geoLocationBox" in data["datacite_geoLocation"]: + for geobbox in data["datacite_geoLocation"]["geoLocationBox"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geolocationBox = ET.SubElement(geolocation, "geoLocationBox") + geolocationBox.text = xml_fix_text(str(geobbox["southBoundLatitude"]) + " " + str(geobbox["westBoundLongitude"]) + " " + + str(geobbox["northBoundLatitude"]) + " " + str(geobbox["eastBoundLongitude"])) + if "geoLocationPoint" in data["datacite_geoLocation"]: + for geopoint in data["datacite_geoLocation"]["geoLocationPoint"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPoint = ET.SubElement(geolocation, "geoLocationPoint") + geoLocationPoint.text = 
xml_fix_text(str(geopoint["pointLatitude"]) + " " + str(geopoint["pointLongitude"])) + if "geoLocationPlace" in data["datacite_geoLocation"]: + for geoplace in data["datacite_geoLocation"]["geoLocationPlace"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPlace = ET.SubElement(geolocation, "geoLocationPlace") + components = [] + if geoplace["place_name"]: + components.append(geoplace["place_name"]) + if geoplace["additional"]: + components.append(geoplace["additional"]) + if geoplace["city"]: + components.append(geoplace["city"]) + if geoplace["province_state"]: + components.append(geoplace["province_state"]) + if geoplace["country"]: + components.append(geoplace["country"]) + # Combine all components of the place name separated by "; " + geoLocationPlace.text = xml_fix_text("; ".join(components)) + + # If geolocations is empty, remove it + if len(geolocations) == 0: + resource.remove(geolocations) + + xml_string = ET.tostring(resource) + return xml_string + +def build_metadata(data): + """Parse single FRDR result into metadata object""" + result = Metadata() + + # Construct identifier compliant with OAI spec + namespace = data['repo_oai_name'] + result.identifier = "oai:" + namespace + ":" + data['local_identifier'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. 
+ created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = construct_datacite_xml(data) + result.metadata_version = None + result.titles = [data['title_en'], data['title_fr']] + result.creators = data['dc:contributor.author'] + result.subjects = [] + + # De-duplicate subjects and tags + for subject in data['frdr:category_en'] + data['frdr:category_fr'] + data['frdr:keywords_en'] + data['frdr:keywords_fr']: + if subject not in result.subjects: + result.subjects.append(subject) + + result.descriptions = data['dc:description_en'] + data['dc:description_fr'] + result.publisher = data['repository_name'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = '' + result.sizes = [] + result.geo_locations = [] + result.resource_types = ['Dataset'] + result.formats = [] + result.identifiers = [data['item_url']] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = data['repo_oai_name'] + result.active = True + + # Add openAccess or restrictedAccess indicator to dc:rights + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + # If Public in frdr:access, use openAccess + if access_entry == "Public": + result.rights.append("openAccess") + break + if "openAccess" not in result.rights: + # If there are access values and none are Public, use restrictedAccess + result.rights.append("restrictedAccess") + else: + # If not indicated, assume Public/openAccess + result.rights.append("openAccess") + + return result + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + 
newdict.append(r) + return newdict + + +def assemble_record(record, db, user, password, server, port): + + if record["item_url"] is None: + return None + + if int(record["deleted"]) == 1: + return None + + if (len(record['title_en']) == 0 and len(record['title_fr']) == 0): + return None + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with con: + from psycopg2.extras import DictCursor + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # get geolocation metadata + record["datacite_geoLocation"] = {} + lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, geobbox.northLat, geobbox.southLat + FROM geobbox WHERE geobbox.record_uuid=%s""", [record["record_uuid"]]) + geobboxes = lookup_cur.fetchall() + if len(geobboxes) > 0: + record["datacite_geoLocation"]["geoLocationBox"] = [] + for geobbox in geobboxes: + record["datacite_geoLocation"]["geoLocationBox"].append({"westBoundLongitude": geobbox["westlon"], + "eastBoundLongitude": geobbox["eastlon"], + "northBoundLatitude": geobbox["northlat"], + "southBoundLatitude": geobbox["southlat"]}) + lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_uuid=%s""", + [record["record_uuid"]]) + geopoints = lookup_cur.fetchall() + if len(geopoints) > 0: + record["datacite_geoLocation"]["geoLocationPoint"] = [] + for geopoint in geopoints: + record["datacite_geoLocation"]["geoLocationPoint"].append({"pointLatitude": geopoint["lat"], + "pointLongitude": geopoint["lon"]}) + + lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name + FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id + WHERE records_x_geoplace.record_uuid=%s""", [record["record_uuid"]]) + geoplaces = lookup_cur.fetchall() + if len(geoplaces) > 0: + record["datacite_geoLocation"]["geoLocationPlace"] = [] + for geoplace in geoplaces: + 
record["datacite_geoLocation"]["geoLocationPlace"].append({"country": geoplace["country"], + "province_state": geoplace["province_state"], + "city": geoplace["city"], + "additional": geoplace["other"], + "place_name": geoplace["place_name"]}) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_uuid"]]) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_uuid=%s""", [record["record_uuid"]]) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_uuid"]]) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'en' """, [record["record_uuid"]]) + record["frdr:category_en"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'fr' """, [record["record_uuid"]]) + record["frdr:category_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on 
records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_uuid=%s""", [record["record_uuid"]]) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_uuid=%s""", [record["record_uuid"]]) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='en' ", [record["record_uuid"]]) + record["dc:description_en"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='fr' ", [record["record_uuid"]]) + record["dc:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'en' """, [record["record_uuid"]]) + record["frdr:keywords_en"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'fr' """, [record["record_uuid"]]) + record["frdr:keywords_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_uuid=%s""", [record["record_uuid"]]) + record["frdr:access"] = rows_to_dict(lookup_cur) + + return record + + +def get_metadata_list( + server, + db, + user, + password, + port, + query=None, + set=None, + from_datetime=None, + until_datetime=None, + cursor=None + ): + + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with records_con: + db_cursor = records_con.cursor() + records_sql = """SELECT recs.record_uuid, recs.title, recs.title_fr, 
recs.pub_date, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id AND recs.deleted!=1 AND recs.item_url!='' AND + recs.pub_date != ''""" + if set is not None and set != 'openaire_data': + records_sql = records_sql + " AND (repos.repo_oai_name='" + set.replace("'", "''") + "')" + if from_datetime is not None: + from_timestamp = int(datetime.timestamp(from_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp>=" + str(from_timestamp) + if until_datetime is not None: + until_timestamp = int(datetime.timestamp(until_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp<" + str(until_timestamp) + records_sql = records_sql + " ORDER BY recs.record_uuid" + if cursor is not None: + records_sql = records_sql + " OFFSET " + str(int(cursor)) + db_cursor.execute(records_sql) + + record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) + full_count = 0 + + results = [] + for row in record_set: + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + + # This is goofy, but full_count isn't always returned for empty results + if int(row[-1]) != 0: + full_count = row[-1] + + full_record = assemble_record(record, db, user, password, server, port) + if full_record is not None: + results.append(build_metadata(full_record)) + + if cursor is not None: + return results, full_count, (len(record_set) + int(cursor)) + else: + return results, full_count, len(record_set) + + +def 
get_metadata(identifier, db, user, password, server, port): + identifier = identifier[4:] + namespace = identifier[:identifier.find(":")] + local_identifier = identifier[identifier.find(":")+1:] + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with records_con: + records_cursor = records_con.cursor() + records_sql = ("""SELECT recs.record_uuid, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id + AND recs.local_identifier = %s AND repos.repo_oai_name = %s""") + records_cursor.execute(records_sql, (local_identifier, namespace)) + row = records_cursor.fetchone() + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + + full_record = assemble_record(record, db, user, password, server, port) + return build_metadata(full_record) + + +def get_sets(db, user, password, server, port): + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with repos_con: + repos_cursor = repos_con.cursor() + + results = [] + results.append(['openaire_data', 'OpenAIRE']) + + repos_cursor.execute("SELECT repo_oai_name, repository_name from repositories") + results.extend(repos_cursor.fetchall()) + + return results, len(results) \ No newline at end of file