diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..71b81bcb --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +# Mac OS files +.DS_Store + + +# Doc build +_build/ +build/ + + +# Package +*.egg-info/ +dist/ + + +# Python cache +__pycache__/ +*.pyc +.pytest_cache/ + + +# Notebooks checkpoints +.ipynb_checkpoints/ diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 00000000..646af0ab --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,16 @@ +======= +Credits +======= + +Development Lead (Quantmetry Team, supporting the MAIF Team) +------------------------------------------------------------- + +* Sacha Samama +* Tom Stringer +* Antoine Simoulin +* Benoit Lebreton + +Contributors +------------ + +None yet. Why not be the first? diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst new file mode 100644 index 00000000..ffb86cb6 --- /dev/null +++ b/CONTRIBUTING.rst @@ -0,0 +1,128 @@ +.. highlight:: shell + +============ +Contributing +============ + +Contributions are welcome, and they are greatly appreciated! Every little bit +helps, and credit will always be given. + +You can contribute in many ways: + +Types of Contributions +---------------------- + +Report Bugs +~~~~~~~~~~~ + +Report bugs at https://github.com/MAIF/melusine/issues. + +If you are reporting a bug, please include: + +* Your operating system name and version. +* Any details about your local setup that might be helpful in troubleshooting. +* Detailed steps to reproduce the bug. + +Fix Bugs +~~~~~~~~ + +Look through the GitHub issues for bugs. Anything tagged with "bug" and "help +wanted" is open to whoever wants to implement it. + +Implement Features +~~~~~~~~~~~~~~~~~~ + +Look through the GitHub issues for features. Anything tagged with "enhancement" +and "help wanted" is open to whoever wants to implement it. + +Write Documentation +~~~~~~~~~~~~~~~~~~~ + +Melusine could always use more documentation, whether as part of the +official melusine docs, in docstrings, or even on the web in blog posts, +articles, and such. + +Submit Feedback +~~~~~~~~~~~~~~~ + +The best way to send feedback is to file an issue at https://github.com/MAIF/melusine/issues. + +If you are proposing a feature: + +* Explain in detail how it would work. +* Keep the scope as narrow as possible, to make it easier to implement. +* Remember that this is a volunteer-driven project, and that contributions + are welcome :) + +Get Started! +------------ + +Ready to contribute? Here's how to set up `melusine` for local development. + +1. Fork the `melusine` repo on GitHub. +2. Clone your fork locally:: + + $ git clone git@github.com:your_name_here/melusine.git + +3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: + + $ mkvirtualenv melusine + $ cd melusine/ + $ python setup.py develop + +4. Create a branch for local development:: + + $ git checkout -b name-of-your-bugfix-or-feature + + Now you can make your changes locally. + +5. When you're done making changes, check that your changes pass flake8 and the + tests, including testing other Python versions with tox:: + + $ flake8 melusine tests + $ python setup.py test or py.test + $ tox + + To get flake8 and tox, just pip install them into your virtualenv. + +6. Commit your changes and push your branch to GitHub:: + + $ git add . + $ git commit -m "Your detailed description of your changes." + $ git push origin name-of-your-bugfix-or-feature + +7. Submit a pull request through the GitHub website.
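+
+Pull requests are expected to ship with tests (see the guidelines below). A minimal pytest
+sketch is shown here; the file name ``tests/test_import.py`` and the test itself are only
+illustrations, and assume nothing beyond the package being importable and exposing
+``__version__`` (as used in ``docs/conf.py``)::
+
+    # Hypothetical smoke test: assumes only that melusine is installed
+    # (e.g. via `python setup.py develop`) and exposes __version__.
+    import melusine
+
+
+    def test_package_is_importable():
+        assert melusine.__version__ is not None
+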
+ +Pull Request Guidelines +----------------------- + +Before you submit a pull request, check that it meets these guidelines: + +1. The pull request should include tests. +2. If the pull request adds functionality, the docs should be updated. Put + your new functionality into a function with a docstring, and add the + feature to the list in README.rst. +3. The pull request should work for Python 2.7, 3.4, 3.5 and 3.6, and for PyPy. Check + https://travis-ci.org/sachasamama/melusine/pull_requests + and make sure that the tests pass for all supported Python versions. + +Tips +---- + +To run a subset of tests:: + +$ py.test tests.test_melusine + + +Deploying +--------- + +A reminder for the maintainers on how to deploy. +Make sure all your changes are committed (including an entry in HISTORY.rst). +Then run:: + +$ bumpversion patch # possible: major / minor / patch +$ git push +$ git push --tags + +Travis will then deploy to PyPI if tests pass. diff --git a/HISTORY.rst b/HISTORY.rst new file mode 100644 index 00000000..cf194ef5 --- /dev/null +++ b/HISTORY.rst @@ -0,0 +1,16 @@ +======= +History +======= + +0.0.2 (2019-02-12) +------------------ + +* Second release on PyPI. +* Alpha project version + + +0.0.1 (2019-01-15) +------------------ + +* First release on PyPI. +* First setup of the project diff --git a/LICENSE b/LICENSE index 261eeb9e..7c6d5a14 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,15 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ +Apache Software License 2.0 - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +Copyright (c) 2019, Maif & Quantmetry - 1. Definitions. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. +http://www.apache.org/licenses/LICENSE-2.0 - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..965b2dda --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,11 @@ +include AUTHORS.rst +include CONTRIBUTING.rst +include HISTORY.rst +include LICENSE +include README.rst + +recursive-include tests * +recursive-exclude * __pycache__ +recursive-exclude * *.py[co] + +recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..ae41e7af --- /dev/null +++ b/Makefile @@ -0,0 +1,88 @@ +.PHONY: clean clean-test clean-pyc clean-build docs help +.DEFAULT_GOAL := help + +define BROWSER_PYSCRIPT +import os, webbrowser, sys + +try: + from urllib import pathname2url +except: + from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +define PRINT_HELP_PYSCRIPT +import re, sys + +for line in sys.stdin: + match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) + if match: + target, help = match.groups() + print("%-20s %s" % (target, help)) +endef +export PRINT_HELP_PYSCRIPT + +BROWSER := python -c "$$BROWSER_PYSCRIPT" + +help: + @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) + +clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts + +clean-build: ## remove build artifacts + rm -fr build/ + rm -fr dist/ + rm -fr .eggs/ + find . -name '*.egg-info' -exec rm -fr {} + + find . -name '*.egg' -exec rm -f {} + + +clean-pyc: ## remove Python file artifacts + find . -name '*.pyc' -exec rm -f {} + + find . -name '*.pyo' -exec rm -f {} + + find . -name '*~' -exec rm -f {} + + find . -name '__pycache__' -exec rm -fr {} + + +clean-test: ## remove test and coverage artifacts + rm -fr .tox/ + rm -f .coverage + rm -fr htmlcov/ + rm -fr .pytest_cache + +lint: ## check style with flake8 + flake8 melusine tests + +test: ## run tests quickly with the default Python + py.test + +test-all: ## run tests on every Python version with tox + tox + +coverage: ## check code coverage quickly with the default Python + coverage run --source melusine -m pytest + coverage report -m + coverage html + $(BROWSER) htmlcov/index.html + +docs: ## generate Sphinx HTML documentation, including API docs + rm -f docs/melusine.rst + rm -f docs/modules.rst + sphinx-apidoc -o docs/ melusine + $(MAKE) -C docs clean + $(MAKE) -C docs html + $(BROWSER) docs/_build/html/index.html + +servedocs: docs ## compile the docs watching for changes + watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . + +release: dist ## package and upload a release + twine upload dist/* + +dist: clean ## builds source and wheel package + python setup.py sdist + python setup.py bdist_wheel + ls -l dist + +install: clean ## install the package to the active Python's site-packages + python setup.py install diff --git a/README.md b/README.md new file mode 100644 index 00000000..8ca0f301 --- /dev/null +++ b/README.md @@ -0,0 +1,318 @@ +# Melusine + + +![](docs/_static/melusine.png) + +[![pypi badge](https://img.shields.io/pypi/v/melusine.svg)](https://pypi.python.org/pypi/melusine) +[![](https://img.shields.io/travis/sachasamama/melusine.svg)](https://travis-ci.org/sachasamama/melusine) +[![documentation badge](https://readthedocs.org/projects/melusine/badge/?version=latest)](https://readthedocs.org/projects/melusine/) + +- Free software: Apache Software License 2.0 +- Documentation: [https://melusine.readthedocs.io](https://melusine.readthedocs.io). 
+ +# Overview + +**Melusine** is a high-level Scikit-Learn API for email classification and feature extraction, +written in Python and capable of running on top of Scikit-Learn, Keras or TensorFlow. +It was developed with a focus on emails written in French. + +Use **Melusine** if you need a library which: + * Supports both convolutional networks and recurrent networks, as well as combinations of the two. + * Runs seamlessly on CPU and GPU. + +**Melusine** is compatible with `Python >= 3.5`. + + +## The Melusine package + +This package is designed for the preprocessing, classification and automatic summarization of emails written in French. + + +![](docs/_static/schema_1.png) + +**3 main subpackages are offered:** + +* ``prepare_email`` : to preprocess and clean the emails. +* ``summarizer`` : to extract keywords from an email. +* ``models`` : to classify e-mails according to categories pre-defined by the user. + +**2 other subpackages are offered as building blocks:** + +* ``nlp_tools`` : to provide classic NLP tools such as tokenizer, phraser and embeddings. +* ``utils`` : to provide a *TransformerScheduler* class to build your own transformer and integrate it into a scikit-learn Pipeline. + +**Another subpackage is also provided** to manage, modify or add parameters such as: regular expressions, keywords, stopwords, etc. + +* ``config`` : contains the *ConfigJsonReader* class to set up and handle a *conf.json* file. This JSON file is the core of this package since it's used by different submodules to preprocess the data. + + +## Getting started: 30 seconds to Melusine + +### Installation + +``` +pip install melusine +``` + +To use Melusine in a project: + +```python +import melusine +``` + +### Input data: Email DataFrame + +The basic requirement to use Melusine is to have an input e-mail DataFrame with the following columns: + +- *body* : Body of an email (single message or conversation historic) +- *header* : Header/Subject of an email +- *date* : Reception date of an email +- *from* : Email address of the sender +- *to* : Email address of the recipient +- *label* (optional): Label of the email for a classification task (examples: Business, Spam, Finance or Family) + +| body | header | date | from | to | label | |:---------------------------|:--------------:|:------------------------------:|:----------------------------:|:-------------------------------------:|:-------:| | Thank you.\\nBye,\\nJohn | Re: Your order | jeudi 24 mai 2018 11 h 49 CEST | anonymous.sender@unknown.com | anonymous.recipient@unknown.fr | label_1 | + +### Pre-processing pipeline + +A working pre-processing pipeline is given below: + +```python +from sklearn.pipeline import Pipeline +from melusine.utils.transformer_scheduler import TransformerScheduler +from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer +from melusine.prepare_email.manage_transfer_reply import update_info_for_transfer_mail +from melusine.prepare_email.manage_transfer_reply import add_boolean_answer +from melusine.prepare_email.manage_transfer_reply import add_boolean_transfer +from melusine.prepare_email.build_historic import build_historic +from melusine.prepare_email.mail_segmenting import structure_email +from melusine.prepare_email.body_header_extraction import extract_last_body +from melusine.prepare_email.cleaning import clean_body +from melusine.prepare_email.cleaning import clean_header + +ManageTransferReply = TransformerScheduler( +functions_scheduler=[
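+    # Each tuple below follows the pattern used throughout this README:
+    # (function, optional tuple of extra positional arguments, list of output column names).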
(check_mail_begin_by_transfer, None, ['is_begin_by_transfer']), + (update_info_for_transfer_mail, None, None), + (add_boolean_answer, None, ['is_answer']), + (add_boolean_transfer, None, ['is_transfer']) +]) + +EmailSegmenting = TransformerScheduler( +functions_scheduler=[ + (build_historic, None, ['structured_historic']), + (structure_email, None, ['structured_body']) +]) + +Cleaning = TransformerScheduler( +functions_scheduler=[ + (extract_last_body, None, ['last_body']), + (clean_body, None, ['clean_body']), + (clean_header, None, ['clean_header']) +]) + +prepare_data_pipeline = Pipeline([ + ('ManageTransferReply', ManageTransferReply), + ('EmailSegmenting', EmailSegmenting), + ('Cleaning', Cleaning), +]) + +df_email = prepare_data_pipeline.fit_transform(df_email) +``` + +In this example, the pre-processing functions applied are: + +- ``check_mail_begin_by_transfer`` : Email is a direct transfer (True/False) +- ``update_info_for_transfer_mail`` : Update body, header, from, to, date if direct transfer +- ``add_boolean_answer`` : Email is an answer (True/False) +- ``add_boolean_transfer`` : Email is transferred (True/False) +- ``build_historic`` : When the email is a conversation, reconstructs the individual message historic +- ``structure_email`` : Splits each message of the historic into parts and tags them (tags: Hello, Body, Greetings, etc.) + +### Phraser and Tokenizer pipeline + +A pipeline to train and apply the phraser and tokenizer is given below: + +```python +from melusine.nlp_tools.phraser import Phraser +from melusine.nlp_tools.phraser import phraser_on_body +from melusine.nlp_tools.phraser import phraser_on_header +from melusine.nlp_tools.tokenizer import Tokenizer + +phraser = Phraser(columns='clean_body') +phraser.train(df_email) +phraser.save('./phraser.pkl') +phraser = Phraser().load('./phraser.pkl') + +PhraserTransformer = TransformerScheduler( +functions_scheduler=[ + (phraser_on_body, (phraser,), ['clean_body']), + (phraser_on_header, (phraser,), ['clean_header']) +]) + +phraser_tokenizer_pipeline = Pipeline([ + ('PhraserTransformer', PhraserTransformer), + ('Tokenizer', Tokenizer(columns=['clean_body', 'clean_header'])) +]) + +df_email = phraser_tokenizer_pipeline.fit_transform(df_email) +``` + +### Embeddings training + +An example of embedding training is given below: + +```python +from melusine.nlp_tools.embedding import Embedding + +embedding = Embedding(columns='clean_body') +embedding.train(df_email) +embedding.save('./embedding.pkl') +``` + +### Metadata pipeline + +A pipeline to prepare the metadata is given below: + +```python +from melusine.prepare_email.metadata_engineering import MetaExtension +from melusine.prepare_email.metadata_engineering import MetaDate +from melusine.prepare_email.metadata_engineering import Dummifier + +metadata_pipeline = Pipeline([ + ('MetaExtension', MetaExtension()), + ('MetaDate', MetaDate()), + ('Dummifier', Dummifier(columns_to_dummify=['extension', 'dayofweek', 'hour'])) +]) + +df_meta = metadata_pipeline.fit_transform(df_email) +``` + +### Keywords extraction + +An example of keywords extraction is given below: + +```python +from melusine.summarizer.keywords_generator import KeywordsGenerator + +keywords_generator = KeywordsGenerator() +df_email = keywords_generator.fit_transform(df_email) +``` + +### Classification + +An example of classification is given below: + +```python +from sklearn.preprocessing import LabelEncoder +from melusine.nlp_tools.embedding import Embedding +from melusine.models.neural_architectures import cnn_model +from melusine.models.train import NeuralModel + +X = df_email.drop(['label'], axis=1)
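+# Keep every engineered feature column except the label; the raw labels are integer-encoded below before training.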
+y = df_email.label + +le = LabelEncoder() +y = le.fit_transform(y) + +pretrained_embedding = Embedding().load('./embedding.pkl') + +nn_model = NeuralModel(neural_architecture_function=cnn_model, + pretrained_embedding=pretrained_embedding) +nn_model.fit(X, y) +# X_test is assumed to be a DataFrame prepared with the same pipelines as X +y_res = nn_model.transform(X_test) +``` + +## Glossary + +### Pandas dataframes columns + +Because Melusine manipulates pandas dataframes, the naming of the columns is imposed. +Here is a basic glossary to provide an understanding of each column manipulated. +Initial columns of the dataframe: + +* **body :** the body of the email. It can be composed of a unique message, a historic of messages, a transfer of messages or a combination of historics and transfers. +* **header :** the subject of the email. +* **date :** the date the email was sent. It corresponds to the date on which the last message of the email was written. +* **from :** the email address of the author of the last message of the email. +* **to :** the email address of the recipient of the last message. + +Columns added by Melusine: + +* **is_begin_by_transfer :** boolean, indicates if the email is a direct transfer. In that case it is recommended to update the values of the initial columns with the information of the transferred message. +* **is_answer :** boolean, indicates if the email contains a historic of messages. +* **is_transfer :** boolean, indicates if the email is a transfer (in that case it does not have to be a direct transfer). +* **structured_historic :** list of dictionaries, each dictionary corresponds to a message of the email. The first dictionary corresponds to the last message (the one that has just been written) while the last dictionary corresponds to the first message of the historic. Each dictionary has two keys: + + - *meta :* to access the metadata of the message as a string. + - *text :* to access the message itself as a string. + +* **structured_body :** list of dictionaries, each dictionary corresponds to a message of the email (a hypothetical example is sketched after the tag list below). The first dictionary corresponds to the last message (the one that has just been written) while the last dictionary corresponds to the first message of the historic. Each dictionary has two keys: + + - *meta :* to access the metadata of the message as a dictionary. The dictionary has three keys: + + *date :* the date of the message. + + *from :* the email address of the author of the message. + + *to :* the email address of the recipient of the message. + + - *text :* to access the message itself as a dictionary. The dictionary has two keys: + + *header :* the subject of the message. + + *structured_text :* the different parts of the message, segmented and tagged, as a list of dictionaries. Each dictionary has two keys: + - *part :* to access the part of the message as a string. + - *tags :* to access the tag of the part of the message. + + +* **last_body :** string, corresponds to the part of the last message of the email that has been tagged as `BODY`. +* **clean_body :** string, corresponds to a cleaned last_body. +* **clean_header :** string, corresponds to a cleaned header. +* **clean_text :** string, concatenation of clean_header and clean_body. +* **tokens :** list of strings, corresponds to a tokenized column, by default clean_text. +* **keywords :** list of strings, corresponds to the keywords extracted from the tokens column. + +### Tags + +Each message of an email is segmented in the **structured_body** column and each part is assigned a tag: + +* `RE/TR` : any metadata such as date, from, to, etc. +* `DISCLAIMER` : any disclaimer such as `L'émetteur décline toute responsabilité...`. +* `GREETINGS` : any greetings such as `Salutations`. +* `PJ` : any indication of an attached document such as `See attached file...`. +* `FOOTER` : any footer such as `Provenance : Courrier pour Windows`. +* `HELLO` : any salutations such as `Bonjour,`. +* `THANKS` : any thanks such as `Avec mes remerciements`. +* `BODY` : the core of the message which contains the valuable information.
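+
+To make the glossary above concrete, here is a *hypothetical* `structured_body` value for the sample
+email of the input DataFrame. It is hand-written from the key descriptions above to show the shape of
+the structure; it is not actual pipeline output, and the exact parts and tags produced may differ:
+
+```python
+structured_body = [
+    {
+        "meta": {
+            "date": "jeudi 24 mai 2018 11 h 49 CEST",
+            "from": "anonymous.sender@unknown.com",
+            "to": "anonymous.recipient@unknown.fr",
+        },
+        "text": {
+            "header": "Re: Your order",
+            "structured_text": [
+                {"part": "Thank you.", "tags": "BODY"},
+                {"part": "Bye,\nJohn", "tags": "GREETINGS"},
+            ],
+        },
+    }
+]
+```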
+ + +## Motivation & history + + +### Origin of the project + +**MAIF**, one of the leading mutual insurance companies in France, receives a large volume of emails from its clients every day +and is under pressure to reply to their requests as efficiently as possible. As such, an efficient routing system is of the +utmost importance to assign each email to the right entity. +However, the outdated legacy routing system made it increasingly difficult for the company to fulfill this pledge. +To face this challenge, MAIF, in collaboration with **Quantmetry**, has implemented a new routing system +based on state-of-the-art NLP and Deep Learning techniques that classifies each email under the right label +according to its content and extracts the relevant information to help the MAIF counsellors process the emails. + +### Ambitions of the project + +**Melusine** is the first open source, free-of-use solution dedicated specifically to the qualification of e-mails written in French. +The ambition of this Python package is to become a reference, but also to live within the French NLP community by federating users and contributors. +Initially developed to solve the problem of routing e-mails received by the MAIF, the solution was implemented using state-of-the-art techniques in Deep Learning and NLP. +Melusine can be interfaced with Scikit-Learn: it offers users the possibility to train their own classification and automatic summarization models according to the constraints of their problem. + +### The collaboration between Quantmetry and MAIF + +After collaborating with Quantmetry, a pure-player AI consulting firm, on the implementation of its routing system, +MAIF pursued the partnership to develop the *Melusine* package. + +### Why Melusine? + +Following MAIF's tradition of naming its open source packages after deities, it was chosen to release this package +under the name of Melusine as a homage to a legend from the local folklore of the Poitou region in France, +where MAIF is historically based. + + +## Credits + +This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [audreyr/cookiecutter-pypackage](https://github.com/audreyr/cookiecutter-pypackage) project template. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..59705295 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python -msphinx +SPHINXPROJ = melusine +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
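+# (Any other target, e.g. "html" or "linkcheck", matches the rule below and is forwarded to sphinx-build via -M.)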
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_static/logo_commun.png b/docs/_static/logo_commun.png new file mode 100644 index 00000000..b061e285 Binary files /dev/null and b/docs/_static/logo_commun.png differ diff --git a/docs/_static/maif.png b/docs/_static/maif.png new file mode 100644 index 00000000..e25170a2 Binary files /dev/null and b/docs/_static/maif.png differ diff --git a/docs/_static/melusine.png b/docs/_static/melusine.png new file mode 100644 index 00000000..04c6eb1a Binary files /dev/null and b/docs/_static/melusine.png differ diff --git a/docs/_static/quantmetry.png b/docs/_static/quantmetry.png new file mode 100644 index 00000000..64c879e6 Binary files /dev/null and b/docs/_static/quantmetry.png differ diff --git a/docs/_static/schema_1.png b/docs/_static/schema_1.png new file mode 100644 index 00000000..d4d12f41 Binary files /dev/null and b/docs/_static/schema_1.png differ diff --git a/docs/_static/schema_2.png b/docs/_static/schema_2.png new file mode 100644 index 00000000..dfed0bf6 Binary files /dev/null and b/docs/_static/schema_2.png differ diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 00000000..66f70f8b --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,12 @@ +Melusine API +============ + +.. toctree:: + :maxdepth: 2 + + melusine.utils + melusine.prepare_email + melusine.nlp_tools + melusine.summarizer + melusine.models + melusine.config diff --git a/docs/authors.rst b/docs/authors.rst new file mode 100644 index 00000000..e122f914 --- /dev/null +++ b/docs/authors.rst @@ -0,0 +1 @@ +.. include:: ../AUTHORS.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100755 index 00000000..76f28639 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# melusine documentation build configuration file, created by +# sphinx-quickstart on Fri Jun 9 13:47:02 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory is +# relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. +# +import os +import sys +sys.path.insert(0, os.path.abspath('..')) + +import melusine +import warnings +warnings.filterwarnings("ignore", message="numpy.dtype size changed") +warnings.filterwarnings("ignore", message="numpy.ufunc size changed") +# import sphinx_bootstrap_theme + +# -- General configuration --------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'numpydoc'] +numpydoc_show_class_members = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'Melusine' +copyright = u"Maif & Quantmetry" +author = u"Sacha Samama & Tom Stringer" + +# The version info for the project you're documenting, acts as replacement +# for |version| and |release|, also used in various other places throughout +# the built documents. +# +# The short X.Y version. +version = melusine.__version__ +# The full version, including alpha/beta/rc tags. +release = melusine.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = 'en' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' +# html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() + +# Theme options are theme-specific and customize the look and feel of a +# theme further. For a list of options available for each theme, see the +# documentation. +# +# theme further. +# html_theme_options = { +# # Navigation bar title. (Default: ``project`` value) +# 'navbar_title': "Demo", +# +# # Tab name for entire site. (Default: "Site") +# 'navbar_site_name': "Site", +# +# # A list of tuples containing pages or urls to link to. +# # Valid tuples should be in the following forms: +# # (name, page) # a link to a page +# # (name, "/aa/bb", 1) # a link to an arbitrary relative url +# # (name, "http://example.com", True) # arbitrary absolute url +# # Note the "1" or "True" value above as the third argument to indicate +# # an arbitrary url. +# 'navbar_links': [ +# ("Examples", "examples"), +# ("Link", "http://example.com", True), +# ], +# +# # Render the next and previous page links in navbar. (Default: true) +# 'navbar_sidebarrel': True, +# +# # Render the current pages TOC in the navbar. (Default: true) +# 'navbar_pagenav': True, +# +# # Tab name for the current pages TOC. (Default: "Page") +# 'navbar_pagenav_name': "Page", +# +# # Global TOC depth for "site" navbar tab. (Default: 1) +# # Switching to -1 shows all levels. +# 'globaltoc_depth': 2, +# +# # Include hidden TOCs in Site navbar? +# # +# # Note: If this is "false", you cannot have mixed ``:hidden:`` and +# # non-hidden ``toctree`` directives in the same page, or else the build +# # will break. +# # +# # Values: "true" (default) or "false" +# 'globaltoc_includehidden': "true", +# +# # HTML navbar class (Default: "navbar") to attach to
element. +# # For black navbar, do "navbar navbar-inverse" +# 'navbar_class': "navbar navbar-inverse", +# +# # Fix navigation bar to top of page? +# # Values: "true" (default) or "false" +# 'navbar_fixed_top': "true", +# +# # Location of link to source. +# # Options are "nav" (default), "footer" or anything else to exclude. +# 'source_link_position': "nav", +# +# # Bootswatch (http://bootswatch.com/) theme. +# # +# # Options are nothing (default) or the name of a valid theme +# # such as "cosmo" or "sandstone". +# # +# # The set of valid themes depend on the version of Bootstrap +# # that's used (the next config option). +# # +# # Currently, the supported themes are: +# # - Bootstrap 2: https://bootswatch.com/2 +# # - Bootstrap 3: https://bootswatch.com/3 +# 'bootswatch_theme': "cosmo", +# +# # Choose Bootstrap version. +# # Values: "3" (default) or "2" (in quotes) +# 'bootstrap_version': "3", +# } + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output --------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'melusinedoc' + + +# -- Options for LaTeX output ------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass +# [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'melusine.tex', + u'melusine Documentation', + u'Sacha Samama & Tom Stringer', 'manual'), +] + + +# -- Options for manual page output ------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'melusine', + u'Melusine Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ---------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'melusine', + u'Melusine Documentation', + author, + 'Melusine', + 'One line description of project.', + 'Miscellaneous'), +] diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 00000000..e582053e --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1 @@ +.. include:: ../CONTRIBUTING.rst diff --git a/docs/dependencies.rst b/docs/dependencies.rst new file mode 100644 index 00000000..67dec959 --- /dev/null +++ b/docs/dependencies.rst @@ -0,0 +1,13 @@ +#################### +Package dependencies +#################### + +**Melusine** works with Python >= 3.5. It builds on top of following packages: + +- ``pandas>=0.22.0`` +- ``scikit-learn>=0.19.0`` +- ``gensim>=3.3.0`` +- ``nltk>=3.3`` +- ``keras>=2.2.0`` +- ``tqdm>=4.14`` +- ``unidecode`` diff --git a/docs/history.rst b/docs/history.rst new file mode 100644 index 00000000..25064996 --- /dev/null +++ b/docs/history.rst @@ -0,0 +1 @@ +.. 
include:: ../HISTORY.rst diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..e6fe900f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,44 @@ +Welcome to Melusine's documentation ! +===================================== + +.. image:: ./_static/melusine.png + :align: center + :scale: 50% + +This package aims to be the reference in the processing and modelization of French email data. +The project has been developed by Quantmetry and MAIF teams. + ++----------------------------------------+----------------------------------------------------------+----------------------------------------------------------+ +| Company | GitHub Account | Website | ++========================================+==========================================================+==========================================================+ +| .. image:: ./_static/quantmetry.png | `Quantmetry GitHub `_ | `Quantmetry `_ | ++----------------------------------------+----------------------------------------------------------+----------------------------------------------------------+ +| .. image:: ./_static/maif.png | `MAIF GitHub `_ | `MAIF `_ | +| :align: center | | | +| :scale: 60% | | | ++----------------------------------------+----------------------------------------------------------+----------------------------------------------------------+ + +.. toctree:: + :maxdepth: 2 + :numbered: + :caption: Contents + + readme + installation + usage + dependencies + +.. toctree:: + :maxdepth: 2 + :caption: API + + api + contributing + authors + history + +Indices and tables +================== +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/installation.rst b/docs/installation.rst new file mode 100644 index 00000000..e4d15d0f --- /dev/null +++ b/docs/installation.rst @@ -0,0 +1,51 @@ +.. highlight:: shell + +============ +Installation +============ + + +Stable release +-------------- + +To install melusine, run this command in your terminal: + +.. code-block:: console + + $ pip install melusine + +This is the preferred method to install melusine, as it will always install the most recent stable release. + +If you don't have `pip`_ installed, this `Python installation guide`_ can guide +you through the process. + +.. _pip: https://pip.pypa.io +.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ + + +From sources +------------ + +The sources for melusine can be downloaded from the `Github repo`_. + +You can either clone the public repository: + +.. code-block:: console + + $ git clone https://github.com/MAIF/melusine + +Or download the `tarball`_: + +.. code-block:: console + + $ curl -OL https://github.com/MAIF/melusine + +Once you have a copy of the source, you can install it with: + +.. code-block:: console + + $ python setup.py install + + +.. _Github repo: https://github.com/MAIF/melusine +.. _tarball: https://github.com/MAIF/melusine/tarball/master diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 00000000..ff00aa03 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,36 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=python -msphinx +) +set SOURCEDIR=. +set BUILDDIR=_build +set SPHINXPROJ=skmail + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The Sphinx module was not found. 
Make sure you have Sphinx installed, + echo.then set the SPHINXBUILD environment variable to point to the full + echo.path of the 'sphinx-build' executable. Alternatively you may add the + echo.Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/docs/melusine.config.rst b/docs/melusine.config.rst new file mode 100644 index 00000000..7deab550 --- /dev/null +++ b/docs/melusine.config.rst @@ -0,0 +1,22 @@ +.. _config: + +======================================== +Config subpackage :mod:`melusine.config` +======================================== + +TODO : ADD DESCRIPTION OF THE SUBPACKAGE + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +Config :mod:`melusine.config.config` +------------------------------------ + +.. automodule:: melusine.config.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/melusine.models.rst b/docs/melusine.models.rst new file mode 100644 index 00000000..6f472e8f --- /dev/null +++ b/docs/melusine.models.rst @@ -0,0 +1,35 @@ +.. _models: + +======================================== +Models subpackage :mod:`melusine.models` +======================================== + +TODO : ADD DESCRIPTION OF THE SUBPACKAGE + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +.. _neural_architectures: + +NeuralArchitectures :mod:`melusine.models.neural_architectures` +--------------------------------------------------------------- + +.. automodule:: melusine.models.neural_architectures + :members: + :undoc-members: + :show-inheritance: + + +.. _train: + +Train :mod:`melusine.models.train` +---------------------------------- + +.. automodule:: melusine.models.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/melusine.nlp_tools.rst b/docs/melusine.nlp_tools.rst new file mode 100644 index 00000000..9be27f19 --- /dev/null +++ b/docs/melusine.nlp_tools.rst @@ -0,0 +1,46 @@ +.. _nlp_tools: + +=============================================== +Nlp\_tools subpackage :mod:`melusine.nlp_tools` +=============================================== + +TODO : ADD DESCRIPTION OF THE SUBPACKAGE + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +.. _tokenizer: + +Tokenizer :mod:`melusine.nlp_tools.tokenizer` +--------------------------------------------- + +.. automodule:: melusine.nlp_tools.tokenizer + :members: + :undoc-members: + :show-inheritance: + + +.. _phraser: + +Phraser :mod:`melusine.nlp_tools.phraser` +----------------------------------------- + +.. automodule:: melusine.nlp_tools.phraser + :members: + :undoc-members: + :show-inheritance: + + +.. _embedding: + +Embedding :mod:`melusine.nlp_tools.embedding` +--------------------------------------------- + +.. automodule:: melusine.nlp_tools.embedding + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/melusine.prepare_email.rst b/docs/melusine.prepare_email.rst new file mode 100644 index 00000000..9fdd93a3 --- /dev/null +++ b/docs/melusine.prepare_email.rst @@ -0,0 +1,77 @@ +.. _prepare_email: + +======================================================= +Prepare\_email subpackage :mod:`melusine.prepare_email` +======================================================= + +.. 
image:: ./_static/schema_2.png + :align: center + :scale: 30% + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +.. _manage_transfer_reply: + +Transfer & Reply :mod:`melusine.prepare_email.manage_transfer_reply` +-------------------------------------------------------------------- + +.. automodule:: melusine.prepare_email.manage_transfer_reply + :members: + :undoc-members: + :show-inheritance: + + +.. _cleaning: + +Cleaning :mod:`melusine.prepare_email.cleaning` +----------------------------------------------- + +.. automodule:: melusine.prepare_email.cleaning + :members: + :undoc-members: + :show-inheritance: + +.. _build_historic: + +Build Email Historic :mod:`melusine.prepare_email.build_historic` +----------------------------------------------------------------- + +.. automodule:: melusine.prepare_email.build_historic + :members: + :undoc-members: + :show-inheritance: + +.. _mail_segmenting: + +Email Segmenting :mod:`melusine.prepare_email.mail_segmenting` +-------------------------------------------------------------- + +.. automodule:: melusine.prepare_email.mail_segmenting + :members: + :undoc-members: + :show-inheritance: + +.. _metadata_engineering: + +Process Email Metadata :mod:`melusine.prepare_email.metadata_engineering` +------------------------------------------------------------------------- + +.. automodule:: melusine.prepare_email.metadata_engineering + :members: + :undoc-members: + :show-inheritance: + +.. _body_header_extraction: + +Extract Email Body & Header :mod:`melusine.prepare_email.body_header_extraction` +-------------------------------------------------------------------------------- + +.. automodule:: melusine.prepare_email.body_header_extraction + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/melusine.summarizer.rst b/docs/melusine.summarizer.rst new file mode 100644 index 00000000..7171dd08 --- /dev/null +++ b/docs/melusine.summarizer.rst @@ -0,0 +1,24 @@ +.. _summarizer: + +================================================ +Summarizer subpackage :mod:`melusine.summarizer` +================================================ + +TODO : ADD DESCRIPTION OF THE SUBPACKAGE + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +.. _keywords_generator: + +KeywordsGenerator :mod:`melusine.summarizer.keywords_generator` +--------------------------------------------------------------- + +.. automodule:: melusine.summarizer.keywords_generator + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/melusine.utils.rst b/docs/melusine.utils.rst new file mode 100644 index 00000000..eba22a0e --- /dev/null +++ b/docs/melusine.utils.rst @@ -0,0 +1,57 @@ +.. _utils: + +====================================== +Utils subpackage :mod:`melusine.utils` +====================================== + +TODO : ADD DESCRIPTION OF THE SUBPACKAGE + +------------------ +List of submodules +------------------ +.. contents:: + :local: + + +.. _transformerScheduler: + +TransformerScheduler :mod:`melusine.utils.transformer_scheduler` +---------------------------------------------------------------- + +.. automodule:: melusine.utils.transformer_scheduler + :members: + :undoc-members: + :show-inheritance: + + +.. _multiprocessing: + +Multiprocessing :mod:`melusine.utils.multiprocessing` +----------------------------------------------------- + +.. automodule:: melusine.utils.multiprocessing + :members: + :undoc-members: + :show-inheritance: + + +.. 
_streamer: + +Streamer :mod:`melusine.utils.streamer` +--------------------------------------- + +.. automodule:: melusine.utils.streamer + :members: + :undoc-members: + :show-inheritance: + + +.. _printParts: + +PrintParts :mod:`melusine.utils.printer` +---------------------------------------- + +.. automodule:: melusine.utils.printer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/readme.rst b/docs/readme.rst new file mode 100644 index 00000000..31b90948 --- /dev/null +++ b/docs/readme.rst @@ -0,0 +1,330 @@ +======== +Melusine +======== + +.. image:: ./_static/melusine.png + :align: center + :scale: 50% + +.. image:: https://img.shields.io/pypi/v/melusine.svg + :target: https://pypi.python.org/pypi/melusine + +.. image:: https://img.shields.io/travis/sachasamama/melusine.svg + :target: https://travis-ci.org/sachasamama/melusine + +.. image:: https://readthedocs.org/projects/melusine/badge/?version=latest + :target: https://melusine.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +* Free software: Apache Software License 2.0 +* Documentation: https://melusine.readthedocs.io. + + +Overview +-------- +**Melusine** is a high-level Scikit-Learn API for emails classification and feature extraction, +written in Python and capable of running on top of Scikit-Learn, Keras or Tensorflow. +It was developed with a focus on emails written in french. + +Use **Melusine** if you need a library which : + * Supports both convolutional networks and recurrent networks, as well as combinations of the two. + * Runs seamlessly on CPU and GPU. + +**Melusine** is compatible with ``Python >= 3.5``. + +.. + Guiding principles + ------------------ + * **Modularity :** A model is understood as a sequence of standalone, fully configurable modules that can be plugged together with as few restrictions as possible. In particular, classification models, cleaning functions and summarization models are all standalone modules that you can combine to create new models. + + * **Easy extensibility :** New modules are simple to add (as new classes and functions), and existing modules provide ample examples. To be able to easily create new modules allows for total expressiveness, making Melusine suitable for specific goals. + + * **Work with Python :** No separate models configuration files in a declarative format. Models are described in Python code, which is compact, easier to debug, and allows for ease of extensibility. + + +The Melusine package +--------------------- + +.. image:: ./_static/schema_1.png + :align: center + :scale: 30% + +This package is designed for the preprocessing, classification and automatic summarization of emails written in french. +**3 main subpackages are offered :** + +* :ref:`prepare_email ` : to preprocess and clean the emails. +* :ref:`summarizer ` : to extract keywords from an email. +* :ref:`models ` : to classify e-mails according to categories pre-defined defined by the user. + +**2 other subpackages are offered as building blocks :** + +* :ref:`nlp_tools ` : to provide classic NLP tools such as tokenizer, phraser and embeddings. +* :ref:`utils ` : to provide a *TransformerScheduler* class to build your own transformer and integrate it into a scikit-learn Pipeline. + +**An other subpackage is also provided** to manage, modify or add parameters such as : regular expressions, keywords, stopwords, etc. + +* :ref:`config ` : contains *ConfigJsonReader* class to setup and handle a *conf.json* file. 
This JSON file is the core of this package since it's used by different submodules to preprocess the data. + + +Getting started: 30 seconds to Melusine +--------------------------------------- +Importing Melusine +^^^^^^^^^^^^^^^^^^ +To use Melusine in a project:: + + import melusine + +Input data : Email DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The basic requirement to use Melusine is to have an input e-mail DataFrame with the following columns: + + - *body* : Body of an email (single message or conversation historic) + - *header* : Header of an email + - *date* : Reception date of an email + - *from* : Email address of the sender + - *to* : Email address of the recipient + - *label* (optional): Label of the email for a classification task (examples: Business, Spam, Finance or Family) + +.. csv-table:: + :header: body, header, date, from, to, label + + "Thank you.\\nBye,\\nJohn", "Re: Your order", "jeudi 24 mai 2018 11 h 49 CEST", "anonymous.sender@unknown.com", "anonymous.recipient@unknown.fr", "??" + +Pre-processing pipeline +^^^^^^^^^^^^^^^^^^^^^^^ +A working pre-processing pipeline is given below:: + + from sklearn.pipeline import Pipeline + from melusine.utils.transformer_scheduler import TransformerScheduler + from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer + from melusine.prepare_email.manage_transfer_reply import update_info_for_transfer_mail + from melusine.prepare_email.manage_transfer_reply import add_boolean_answer + from melusine.prepare_email.manage_transfer_reply import add_boolean_transfer + from melusine.prepare_email.build_historic import build_historic + from melusine.prepare_email.mail_segmenting import structure_email + from melusine.prepare_email.body_header_extraction import extract_last_body + from melusine.prepare_email.cleaning import clean_body + from melusine.prepare_email.cleaning import clean_header + + ManageTransferReply = TransformerScheduler( + functions_scheduler=[ + (check_mail_begin_by_transfer, None, ['is_begin_by_transfer']), + (update_info_for_transfer_mail, None, None), + (add_boolean_answer, None, ['is_answer']), + (add_boolean_transfer, None, ['is_transfer']) + ]) + + EmailSegmenting = TransformerScheduler( + functions_scheduler=[ + (build_historic, None, ['structured_historic']), + (structure_email, None, ['structured_body']) + ]) + + Cleaning = TransformerScheduler( + functions_scheduler=[ + (extract_last_body, None, ['last_body']), + (clean_body, None, ['clean_body']), + (clean_header, None, ['clean_header']) + ]) + + prepare_data_pipeline = Pipeline([ + ('ManageTransferReply', ManageTransferReply), + ('EmailSegmenting', EmailSegmenting), + ('Cleaning', Cleaning), + ]) + + df_email = prepare_data_pipeline.fit_transform(df_email) + +In this example, the pre-processing functions applied are: + + - :ref:`check_mail_begin_by_transfer ` : Email is a direct transfer (True/False) + - :ref:`update_info_for_transfer_mail ` : Update body, header, from, to, date if direct transfer + - :ref:`add_boolean_answer` : Email is an answer (True/False) + - :ref:`add_boolean_transfer` : Email is transferred (True/False) + - :ref:`build_historic` : When email is a conversation, reconstructs the individual message historic + - :ref:`structure_email` : Splits parts of each messages in historic and tags them (tags: Hello, Body, Greetings, etc) + +Phraser and Tokenizer pipeline +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A pipeline to train and apply the phraser end tokenizer is given below:: + + from melusine.nlp_tools.phraser import Phraser 
+    from melusine.nlp_tools.phraser import phraser_on_body
+    from melusine.nlp_tools.phraser import phraser_on_header
+    from melusine.nlp_tools.tokenizer import Tokenizer
+
+    phraser = Phraser(columns='clean_body')
+    phraser.train(df_email)
+    phraser.save('./phraser.pkl')
+    phraser = Phraser().load('./phraser.pkl')
+
+    PhraserTransformer = TransformerScheduler(
+        functions_scheduler=[
+            (phraser_on_body, (phraser,), ['clean_body']),
+            (phraser_on_header, (phraser,), ['clean_header'])
+        ])
+
+    phraser_tokenizer_pipeline = Pipeline([
+        ('PhraserTransformer', PhraserTransformer),
+        ('Tokenizer', Tokenizer(columns=['clean_body', 'clean_header']))
+    ])
+
+    df_email = phraser_tokenizer_pipeline.fit_transform(df_email)
+
+Embeddings training
+^^^^^^^^^^^^^^^^^^^
+An example of embedding training is given below::
+
+    from melusine.nlp_tools.embedding import Embedding
+
+    embedding = Embedding(columns='clean_body')
+    embedding.train(df_email)
+    embedding.save('./embedding.pkl')
+
+
+Metadata pipeline
+^^^^^^^^^^^^^^^^^
+A pipeline to prepare the metadata is given below::
+
+    from melusine.prepare_email.metadata_engineering import MetaExtension
+    from melusine.prepare_email.metadata_engineering import MetaDate
+    from melusine.prepare_email.metadata_engineering import Dummifier
+
+    metadata_pipeline = Pipeline([
+        ('MetaExtension', MetaExtension()),
+        ('MetaDate', MetaDate()),
+        ('Dummifier', Dummifier(columns_to_dummify=['extension', 'dayofweek', 'hour']))
+    ])
+
+    df_meta = metadata_pipeline.fit_transform(df_email)
+
+Keywords extraction
+^^^^^^^^^^^^^^^^^^^
+An example of keywords extraction is given below::
+
+    from melusine.summarizer.keywords_generator import KeywordsGenerator
+
+    keywords_generator = KeywordsGenerator()
+    keywords_generator.fit(df_email)
+    df_email = keywords_generator.transform(df_email)
+
+Classification
+^^^^^^^^^^^^^^
+An example of classification is given below::
+
+    from sklearn.preprocessing import LabelEncoder
+    from melusine.nlp_tools.embedding import Embedding
+    from melusine.models.neural_architectures import cnn_model
+    from melusine.models.train import NeuralModel
+
+    X = df_email.drop(['label'], axis=1)
+    y = df_email.label
+
+    le = LabelEncoder()
+    y = le.fit_transform(y)
+
+    pretrained_embedding = Embedding().load('./embedding.pkl')
+
+    nn_model = NeuralModel(neural_architecture_function=cnn_model,
+                           pretrained_embedding=pretrained_embedding)
+    nn_model.fit(X, y)
+    y_res = nn_model.predict(X_test)
+
+
+Glossary
+--------
+Pandas dataframes columns
+^^^^^^^^^^^^^^^^^^^^^^^^^
+Because Melusine manipulates pandas dataframes, the naming of the columns is imposed.
+Here is a basic glossary to provide an understanding of each column manipulated.
+Initial columns of the dataframe:
+
+* **body :** the body of the email. It can be composed of a unique message, a historic of messages, a transfer of messages or a combination of historics and transfers.
+* **header :** the subject of the email.
+* **date :** the date the email has been sent. It corresponds to the date on which the last message of the email was written.
+* **from :** the email address of the author of the last message of the email.
+* **to :** the email address of the recipient of the last message.
+
+Columns added by Melusine:
+
+* **is_begin_by_transfer :** boolean, indicates if the email is a direct transfer. In that case it is recommended to update the value of the initial columns with the information of the transferred message.
+* **is_answer :** boolean, indicates if the email contains a historic of messages.
+* **is_transfer :** boolean, indicates if the email is a transfer (in that case it does not have to be a direct transfer).
+* **structured_historic :** list of dictionaries, each dictionary corresponds to a message of the email. The first dictionary corresponds to the last message (the one that has been written) while the last dictionary corresponds to the first message of the historic. Each dictionary has two keys :
+
+    - *meta :* to access the metadata of the message as a string.
+    - *text :* to access the message itself as a string.
+
+* **structured_body :** list of dictionaries, each dictionary corresponds to a message of the email. The first dictionary corresponds to the last message (the one that has been written) while the last dictionary corresponds to the first message of the historic. Each dictionary has two keys :
+
+    - *meta :* to access the metadata of the message as a dictionary. The dictionary has three keys:
+
+        + *date :* the date of the message.
+        + *from :* the email address of the author of the message.
+        + *to :* the email address of the recipient of the message.
+
+    - *text :* to access the message itself as a dictionary. The dictionary has two keys:
+
+        + *header :* the subject of the message.
+        + *structured_text :* the different parts of the message segmented and tagged as a list of dictionaries. Each dictionary has two keys:
+
+            - *part :* to access the part of the message as a string.
+            - *tags :* to access the tag of the part of the message.
+
+* **last_body :** string, corresponds to the part of the last message of the email that has been tagged as "BODY".
+* **clean_body :** string, corresponds to a cleaned last_body.
+* **clean_header :** string, corresponds to a cleaned header.
+* **clean_text :** string, concatenation of clean_header and clean_body.
+* **tokens :** list of strings, corresponds to a tokenized column, by default clean_text.
+* **keywords :** list of strings, corresponds to the keywords extracted from the tokens column.
+
+Tags
+^^^^
+Each message of an email is segmented in the **structured_body** column and each part is assigned a tag:
+
+* "RE/TR" : any metadata such as date, from, to etc.
+* "DISCLAIMER" : any disclaimer such as "L'émetteur décline toute responsabilité...".
+* "GREETINGS" : any greetings such as "Salutations".
+* "PJ" : any indication of an attached document such as "See attached file...".
+* "FOOTER" : any footer such as "Provenance : Courrier pour Windows".
+* "HELLO" : any salutations such as "Bonjour,".
+* "THANKS" : any thanks such as "Avec mes remerciements".
+* "BODY" : the core of the message which contains the valuable information.
+
+
+Motivation & history
+--------------------
+
+Origin of the project
+^^^^^^^^^^^^^^^^^^^^^
+**MAIF**, being one of the leading mutual insurance companies in France, receives daily a large volume of emails from its clients
+and is under pressure to reply to their requests as efficiently as possible. As such, an efficient routing system is of the
+utmost importance to assign each email to the right entity.
+However, the previously outdated routing system made it increasingly difficult for the company to fulfill this pledge.
+In order to face up to this challenge, MAIF, in collaboration with **Quantmetry**, has implemented a new routing system
+based on state-of-the-art NLP and Deep Learning techniques that classifies each email under the right label
+according to its content and extracts the relevant information to help the MAIF counsellors process the emails.
+ +Ambitions of the project +^^^^^^^^^^^^^^^^^^^^^^^^ +**Melusine** is the first Open Source and free-of-use solution dedicated specifically to the qualification of e-mails written in french. +The ambition of this Python package is to become a reference, but also to live in the French NLP community by federating users and contributors. +Initially developed to answer the problem of routing e-mails received by the MAIF, the solution was implemented using state-of-the-art techniques in Deep Learning and NLP. +Melusine can be interfaced with Scikit-Learn: it offers the user the possibility to train his own classification and automatic summarization model according to the constraints of his problem. + +The collaboration between Quantmetry and MAIF +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +After collaborating for the implementation of its routing system with Quantmetry, +a pure player consulting firm in AI, MAIF pursued the partnership to develop the *Melusine* package. + +Why Melusine ? +^^^^^^^^^^^^^^ +Following MAIF's tradition to name its open source packages after deities, it was chosen to release this package +under the name of Melusine as an homage to a legend from the local folklore in the Poitou region in France +where MAIF is historically based. + + +Credits +------- + +This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. + +.. _Cookiecutter: https://github.com/audreyr/cookiecutter +.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 00000000..0227e335 --- /dev/null +++ b/docs/usage.rst @@ -0,0 +1,565 @@ +===== +Usage +===== + +To use melusine in a project:: + + import melusine + +Melusine input data : Email DataFrames +-------------------------------------- + +The basic requirement to use Melusine is to have an input e-mail DataFrame with the following columns: + + - ``body`` : Body of an email (single message or conversation historic) + - ``header``: Header of an email + - ``date`` : Reception date of an email + - ``from`` : Email address of the sender + - ``to`` : Email address of the recipient + - ``label`` (optional) : Label of the email for a classification task (examples: Business, Spam, Finance or Family) + +.. csv-table:: + :header: body, header, date, from, to, label + + "Thank you.\\nBye,\\nJohn", "Re: Your order", "jeudi 24 mai 2018 11 h 49 CEST", "anonymous.sender@unknown.com", "anonymous.recipient@unknown.fr", "A" + +In the examples presented below, a toy email DataFrame containing anonymized emails is used. +The toy DataFrame can be loaded as follows:: + + import melusine + import pandas as pd + + df_email = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + df_email.head() + + +Prepare email subpackage : Basic usage +-------------------------------------- + +A common pre-processing step is to check whether an e-mail is an answer or not. +This can be done in Melusine with the function :ref:`add_boolean_answer`:: + + from melusine.prepare_email.manage_transfer_reply import add_boolean_answer + + df_email['is_answer'] = df_email.apply(add_boolean_answer, axis=1) + + +A new column ``is_answer`` is created containing a boolean variable: + + - True if the message is an answer + - False if the message is not an answer + +.. 
csv-table::
+    :header: body, header, is_answer
+
+    "Happy Birthday Bill!!", "Birthday", False
+    "Thank you", "Re: Birthday", True
+
+Create an email pre-processing pipeline
+---------------------------------------
+
+An email pre-processing pipeline takes an email DataFrame as input and executes a sequence of *Transformers*
+on every email in the DataFrame.
+The recommended way to create a pre-processing pipeline with Melusine is to:
+
+    1. Wrap pre-processing functions in :ref:`TransformerScheduler` objects.
+    2. Use a `Scikit-Learn Pipeline `_ object to chain the transformers.
+
+Once the pipeline has been set up, the pre-processing of an email DataFrame is straightforward:
+
+    >>> df_email_preprocessed = pipeline.fit_transform(df_email)
+
+TransformerScheduler class
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Functions can be wrapped in a :ref:`TransformerScheduler` object that can be integrated into an execution Pipeline.
+``TransformerScheduler`` objects are compatible with the `scikit-learn API `_
+(they have fit and transform methods).
+
+
+A ``TransformerScheduler`` object is initialized with a functions_scheduler argument.
+The functions_scheduler argument is a list of tuples containing information about the desired pre-processing functions.
+Each tuple describes an individual function and should contain the following elements:
+
+    1. A function
+    2. A tuple with the function's arguments
+       (if no arguments are required, use None or an empty tuple)
+    3. A list of the column name(s) returned by the function
+       (if the function creates no new column, use None or an empty list)
+
+The code below describes the definition of a transformer::
+
+    from melusine.utils.transformer_scheduler import TransformerScheduler
+
+    melusine_transformer = TransformerScheduler(
+        functions_scheduler=[
+            (my_function_1, (argument1, argument2), ['return_col_A']),
+            (my_function_2, None, ['return_col_B', 'return_col_C']),
+            (my_function_3, (argument1, ), None)
+        ],
+        mode='apply_by_multiprocessing',
+        n_jobs=4)
+
+The other parameters of the *TransformerScheduler* class are:
+
+    - ``mode`` (optional): Defines how the functions are applied along a row axis (axis=1).
+      If set to 'apply_by_multiprocessing', it uses multiprocessing to parallelize computation.
+      Possible values are 'apply' (default) and 'apply_by_multiprocessing'.
+
+    - ``n_jobs`` (optional): Number of cores used for computation. Default value, 1.
+      Possible values are integers ranging from 1 (default) to the number of cores available for computation.
+
+A TransformerScheduler can be used independently or included in a scikit-learn pipeline (recommended):
+
+    >>> # Used independently
+    >>> df_email = melusine_transformer.fit_transform(df_email)
+
+    >>> # Used in a scikit-learn pipeline
+    >>> from sklearn.pipeline import Pipeline
+    >>> pipeline = Pipeline([('MelusineTransformer', melusine_transformer)])
+    >>> df_email = pipeline.fit_transform(df_email)
+
+The *fit_transform* method returns a DataFrame with new features (new columns):
+
+.. csv-table::
+    :header: body, header, return_col_A, return_col_B, return_col_C
+
+    "Happy Birthday Bill!!", "Birthday", "new_feature_A", "new_feature_B", "new_feature_C"
+    "Thank you", "Re: Birthday", "new_feature_A", "new_feature_B", "new_feature_C"
+
+
+Chaining transformers in a scikit-learn pipeline
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Once all the desired functions and transformers have been defined, transformers can be chained in a `Scikit-Learn Pipeline `_.
+The code below describes the definition of a pipeline:: + + from sklearn.pipeline import Pipeline + + pipeline = Pipeline([ + ('TransformerName1', TransformerObject1), + ('TransformerName2', TransformerObject2), + ('TransformerName3', TransformerObject3), + ]) + +Example of a working pipeline +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A working pre-processing pipeline is given below:: + + from sklearn.pipeline import Pipeline + from melusine.utils.transformer_scheduler import TransformerScheduler + from melusine.prepare_email.manage_transfer_reply import add_boolean_answer, add_boolean_transfer + from melusine.prepare_email.build_historic import build_historic + from melusine.prepare_email.mail_segmenting import structure_email + + ManageTransferReply = TransformerScheduler( + functions_scheduler=[ + (add_boolean_answer, None, ['is_answer']), + (add_boolean_transfer, None, ['is_transfer']) + ]) + + HistoricBuilder = TransformerScheduler( + functions_scheduler=[ + (build_historic, None, ['structured_historic']), + ]) + + Segmenting = TransformerScheduler( + functions_scheduler=[ + (structure_email, None, ['structured_body']) + ]) + + prepare_data_pipeline = Pipeline([ + ('ManageTransferReply', ManageTransferReply), + ('HistoricBuilder', HistoricBuilder), + ('Segmenting', Segmenting), + ]) + + df_email = prepare_data_pipeline.fit_transform(df_email) + +In this example, the pre-processing functions applied are: + + - :ref:`add_boolean_answer` : Email is an answer (True/False) + - :ref:`add_boolean_transfer` : Email is transferred (True/False) + - :ref:`build_historic` : When email is a conversation, reconstructs the individual message historic + - :ref:`structure_email` : Splits parts of each messages in historic and tags them (tags: Hello, Body, Greetings, etc) + +Create a custom email pre-processing function +---------------------------------------------- + +Creating a custom pre-processing function and adding it to a pre-processing pipeline can be done easily with *Melusine*. +Two main requirements are: + + 1. Make the function compatible with the pandas apply method + * First argument should be 'row' (Row of an email DataFrame) + >>> def my_function(row, arg1, arg2): + * Example: row['header'] will contain the header of a message + 2. 
Make sure to call existing columns of the DataFrame + * Don't call row['is_answer'] before the 'is_answer' column has been created + +The following example creates a custom function to count the occurrence of a word in the body of an email:: + + from sklearn.pipeline import Pipeline + from melusine.utils.transformer_scheduler import TransformerScheduler + from melusine.prepare_email.manage_transfer_reply import add_boolean_answer, add_boolean_transfer + + # Create a fake email Dataframe + df_duck = pd.DataFrame({ + "body" : ["Lion Duck Pony", "Duck Pony Pony", "Duck Duck Pony"], + "header" : ["zoo report", "Tr : zoo report", "Re : zoo report"] + }) + + # Define a custom function + def count_word_occurrence_in_body(row, word): + all_word_list = row["body"].lower().split() + word_occurence = all_word_list.count(word) + return word_occurence + + # Wrap function in a transformer + CountWordOccurrence = TransformerScheduler( + functions_scheduler=[ + (count_word_occurrence_in_body, ("duck",), ['duck_count']), + (count_word_occurrence_in_body, ("pony",), ['pony_count']), + ]) + + # Create a second transformer with regular Melusine functions + ManageTransferReply = TransformerScheduler( + functions_scheduler=[ + (add_boolean_answer, None, ['is_answer']), + (add_boolean_transfer, None, ['is_transfer']) + ]) + + # Chain transformers in a pipeline + prepare_data_pipeline = Pipeline([ + ('CountWordOccurrence', CountWordOccurrence), # Transformer with custom functions + ('ManageTransferReply', ManageTransferReply), # Transformer with regular Melusine functions + ]) + + # Pre-process input DataFrame + df_duck_prep = prepare_data_pipeline.fit_transform(df_duck) + +.. csv-table:: + :header: body, header, duck_count, pony_count, is_answer, is_transfer + + "Lion Duck Pony", "zoo report", "1", "1", False, False + "Duck Duck Pony", "Re : zoo report", "2", "1", "True", "False" + "Duck Pony Pony", "Tr : zoo report", "1", "2", False, False + +Note : It is totally fine to mix regular and custom functions in a transformer. + +Testing a function on a single email +------------------------------------ + +Since all pre-processing functions are made compatible with pandas apply function, +a function can be tested on a single email. +In the example below, the function :ref:`add_boolean_answer` is tested on a single email:: + + from melusine.prepare_email.manage_transfer_reply import add_boolean_answer + + email_index = 2 + email_is_answer = add_boolean_answer(df_email.iloc[email_index]) + print("Message %d is an answer: %r" %(email_index, email_is_answer)) + +Output:: + + "Message 2 is an answer: True" + + + +NLP tools subpackage +-------------------- + +The different classes of the NLP tools subpackage are described in this section. + +Phraser +^^^^^^^ + +The Melusine :ref:`Phraser ` class transforms common multi-word expressions into single elements: + + >>> new york -> new_york + +To train a Melusine Phraser (which is based on a `Gensim Phraser `_), +the input email DataFrame should contain a 'clean_body' column which can be created with the :ref:`clean_body` function. 
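+
+For instance, a minimal sketch (assuming the email DataFrame has already been passed through the
+pre-processing pipeline shown earlier, so that the column consumed by :ref:`clean_body` is available)::
+
+    from melusine.prepare_email.cleaning import clean_body
+
+    # clean_body is compatible with the pandas apply method (its first argument is a DataFrame row)
+    df_email['clean_body'] = df_email.apply(clean_body, axis=1)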
+ +In the example below, a Phraser is trained on a toy DataFrame:: + + from melusine.nlp_tools.phraser import Phraser + from melusine.nlp_tools.phraser import phraser_on_text + + phraser = Phraser() + df_new_york = pd.DataFrame({ + 'clean_body' : ["new york is so cool", "i love new york", "new york city"] + }) + + phraser.train(df_new_york) + + df_new_york['clean_body'] = df_new_york['clean_body'].apply(phraser_on_text, args=(phraser,)) + + # Save the Phraser instance to disk + phraser.save(filepath) + # Load the Phraser + phraser = phraser().load(filepath) + +In reality, a training set with only 3 emails is too small to train a Phraser. +For illustrative purpose, the table below shows the expected output. + +.. csv-table:: + :header: clean_body, clean_body_new + + "new york is so cool", "new_york is so cool" + "i love new york", "i love new_york" + "new york city", "new_york city" + +The specific parameters of the :ref:`Phraser ` class are: + + - *common_terms* : list of stopwords to be ignored (default value = stopword list from NLTK) + - *threshold* : threshold to select collocations + - *min_count* : minimum count of word to be selected as collocation + +Tokenizer +^^^^^^^^^ + +A tokenizer splits a sentence-like string into a list of sub-strings (tokens). +The Melusine :ref:`Tokenizer ` class is based on a `NLTK regular expression tokenizer `_ +which uses a regular expression (regex) pattern to tokenize the text:: + + from melusine.nlp_tools.tokenizer import Tokenizer + + df_tok = pd.DataFrame({ + 'clean_body' : ["hello, i'm here to tokenize text. bye"], + 'clean_header' : ["re: hello"], + }) + + tokenizer = Tokenizer(columns=['clean_body', 'clean_header']) + df_tok = tokenizer.fit_transform(df_tok) + +A new column ``tokens`` is created with a list of the tokens extracted from the text data. + +.. csv-table:: + :header: clean_body, clean_header, tokens + + "hello, i'm here to tokenize text. bye", "re: hello", "['re', 'hello', 'hello', 'i', 'm', 'here', 'to', 'tokenize', 'text', 'bye']" + +The specific parameters of the :ref:`Tokenizer ` class are: + + - *stopwords* : list of keywords to be ignored (this list can be defined in the conf file) + - *stop_removal* : True if stopwords should be removed, else False + +Embeddings +^^^^^^^^^^ + +With a regular representation of words, there is one dimension for each word in the vocabulary +(set of all the words in a text corpus). +The computational cost of NLP tasks, such as training a neural network model, based on such a high dimensional space can be prohibitive. +`Word embeddings `_ are abstract representations of words in a lower dimensional vector space. +One of the advantages of word embeddings is thus to save computational cost. + +The Melusine :ref:`Embedding ` class uses the `Gensim Word2Vec module `_ to train a `word2vec model `_. +The trained Embedding object will be used in the :ref:`Models` subpackage to train a Neural Network to classify emails. + +The code below illustrates how the Embeddings class works. 
It should be noted that, in practice, to train a word embedding model, a lot of emails are required::
+
+    from melusine.nlp_tools.embedding import Embedding
+
+    df_embeddings = pd.DataFrame({
+        'clean_body' : ["word text word text data word text"],
+        'clean_header' : ["re: hello"],
+    })
+
+    embedding = Embedding(columns='clean_body', min_count=3)
+    embedding.train(df_embeddings)
+
+    # Save the trained Embedding instance to disk
+    embedding.save(filepath)
+
+    # Load the trained Embedding instance
+    embedding = Embedding().load(filepath)
+
+    # Use the trained Embedding to initialise the Neural Network Model
+    # The definition of a neural network model is not discussed in this section
+    nn_model = NeuralModel(..., pretrained_embedding=embedding)
+
+Summarizer subpackage
+---------------------
+
+The main item of the :ref:`Summarizer` subpackage is the :ref:`KeywordsGenerator` class.
+The KeywordsGenerator class extracts relevant keywords from the text data based on a `tf-idf `_ score.
+
+Requirements on the input DataFrame to use a KeywordsGenerator:
+
+    - KeywordsGenerator requires a 'tokens' column which can be generated with a :ref:`Tokenizer `.
+
+Keywords can then be extracted as follows::
+
+    from melusine.summarizer.keywords_generator import KeywordsGenerator
+    from melusine.nlp_tools.tokenizer import Tokenizer
+
+
+    df_zoo = pd.DataFrame({
+        'clean_body' : ["i like lions and ponies and gorillas", "i like ponies and tigers", "i like tigers and lions", "i like raccoons and unicorns"],
+        'clean_header' : ["things i like", "things i like", "things i like", "things i like"]
+    })
+
+    tokenizer = Tokenizer(columns=['clean_body', 'clean_header'])
+    # Create the 'tokens' column
+    df_zoo = tokenizer.fit_transform(df_zoo)
+
+    keywords_generator = KeywordsGenerator(n_max_keywords=2, stopwords=['like'])
+    # Fit the keywords generator on the text data corpus (using the tokens column)
+    keywords_generator.fit(df_zoo)
+    # Extract relevant keywords
+    df_zoo = keywords_generator.transform(df_zoo)
+
+In the text data of the example, some words are very common such as "i", "like" or "things", whereas other words are rare, such as "raccoons".
+The keyword generator prioritises the rare words in the keyword extraction process:
+
+.. csv-table::
+    :header: clean_body, clean_header, tokens, keywords
+
+    "i like lions and ponies and gorillas", "things i like", "[things, i, i, lions, and, ponies, and, gorillas]", "[lions, ponies]"
+    "i like ponies and tigers", "things i like", "[things, i, i, ponies, and, tigers]", "[ponies, tigers]"
+    "i like tigers and lions", "things i like", "[things, i, i, tigers, and, lions]", "[tigers, lions]"
+    "i like raccoons and unicorns", "things i like", "[things, i, i, raccoons, and, unicorns]", "[raccoons, unicorns]"
+
+The specific parameters of the :ref:`KeywordsGenerator` class are:
+
+    - *max_tfidf_features* : size of the vocabulary for the tf-idf
+    - *keywords* : list of keywords to be extracted in priority (this list can be defined in the conf file)
+    - *stopwords* : list of keywords to be ignored (this list can be defined in the conf file)
+    - *resample* : when the DataFrame contains a 'label' column, balance the dataset by resampling
+    - *n_max_keywords* : maximum number of keywords to be returned for each email
+    - *n_min_keywords* : minimum number of keywords to be returned for each email
+    - *threshold_keywords* : minimum tf-idf score for a word to be selected as a keyword
+
+Models subpackage
+-----------------
+
+The main item of the Models subpackage is the :ref:`NeuralModel ` class.
+The NeuralModel creates a Neural Network that can be trained and used to classify emails. + +The minimum input features required by the NeuralModel class are the following: + + - An email DataFrame with: + + - an integer 'label' column (a label encoder can be used to convert class names into integers) + - a 'clean_text' column with text data + - An instance of the :ref:`Embedding ` class (Trained word embedding model) + +The code below shows a minimal working example for Email Classification using a NeuralModel instance (a much larger training set is required to obtain meaningful results):: + + + # Prepare email + from melusine.utils.transformer_scheduler import TransformerScheduler + from melusine.prepare_email.manage_transfer_reply import \ + check_mail_begin_by_transfer, update_info_for_transfer_mail, add_boolean_answer, add_boolean_transfer + from melusine.prepare_email.build_historic import build_historic + from melusine.prepare_email.mail_segmenting import structure_email + from melusine.prepare_email.body_header_extraction import extract_last_body, extract_header + from melusine.prepare_email.cleaning import clean_body, clean_header + from melusine.prepare_email.metadata_engineering import MetaDate, MetaExtension, Dummifier + + # Scikit-Learn API + from sklearn.pipeline import Pipeline + + # NLP tools + from melusine.nlp_tools.tokenizer import Tokenizer + from melusine.nlp_tools.embedding import Embedding + + # Summarizer + from melusine.summarizer.keywords_generator import KeywordsGenerator + + # Models + from melusine.models.train import NeuralModel + from melusine.models.neural_architectures import cnn_model + + X = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + + # Convert 'label' column to integer values + X['label'] = X_train['label'].astype("category").cat.codes + + # Prepare mail + ManageTransferReply = TransformerScheduler( + functions_scheduler=[ + (check_mail_begin_by_transfer, (), ['is_begin_by_transfered']), + (update_info_for_transfer_mail, (), None), + (add_boolean_answer, (), ['is_answer']), + (add_boolean_transfer, (), ['is_transfer']) + ], + mode='apply_by_multiprocessing', + n_jobs=4) + + HistoricBuilder = TransformerScheduler( + functions_scheduler=[ + (build_historic, (), ['structured_historic']), + ], + mode='apply_by_multiprocessing', + n_jobs=4) + + Segmenting = TransformerScheduler( + functions_scheduler=[ + (structure_email, (), ['structured_body']) + ], + mode='apply_by_multiprocessing', + n_jobs=4) + + GetLastBodyHeader = TransformerScheduler( + functions_scheduler=[ + (extract_last_body, (), ['last_body']), + (extract_header, (), ['last_header']) + ], + mode='apply_by_multiprocessing', + n_jobs=4) + + Cleaner = TransformerScheduler( + functions_scheduler=[ + (clean_body, (), ['clean_body']), + (clean_header, (), ['clean_header']), + ], + mode='apply_by_multiprocessing', + n_jobs=4) + + prepare_data_pipeline = Pipeline([ + ('ManageTransferReply', ManageTransferReply), + ('HistoricBuilder', HistoricBuilder), + ('Segmenting', Segmenting), + ('GetLastBodyHeader', GetLastBodyHeader), + ('Cleaner', Cleaner), + ('MetaExtension', MetaExtension()), + ('MetaDate', MetaDate()), + ]) + + # Run prepare email pipeline + X = prepare_data_pipeline.fit_transform(X) + + # Dummify categorical data + categorical_cols = [cols for cols in ['extension', 'dayofweek', 'hour'] if cols in X.columns] + X_dummy = Dummifier(columns_to_dummify=categorical_cols).fit_transform(X) + + # Concatenate dummified features with original features + X_train = pd.concat([X, 
X_dummy], axis=1) + + # Create and train a word embedding model + embedding = Embedding(columns='clean_body', min_count=2) + embedding = embedding.train(X_train) + + def concatenate_body_header(row): + """Concatenate header content and body content.""" + clean_text = row['clean_header'] + " // " + row['clean_body'] + return clean_text + + X_train['clean_text'] = X_train.apply(concatenate_body_header, axis=1) + + # List of columns containing meta-data + list_meta = ['extension', 'dayofweek', 'hour'] + + # Instanciate a NeuralModel instance with a CNN (imported from the neural_architectures module), an embedding and a list od meta data as arguments + nn_model = NeuralModel(cnn_model, embedding, list_meta = list_meta) + + # Train the NeuralModel + nn_model.fit(X_train.drop(['label'], axis=1), X_train['label']) + + # Make a prediction with the trained model + y_res = nn_model.predict(X_train.drop(['label'], axis=1)) + + +TODO : Describe NeuralModel parameters diff --git a/melusine/__init__.py b/melusine/__init__.py new file mode 100644 index 00000000..90ea7341 --- /dev/null +++ b/melusine/__init__.py @@ -0,0 +1,5 @@ +"""Top-level package for Melusine.""" + +__author__ = """Sacha Samama & Tom Stringer""" +__email__ = 'ssamama@quantmetry.com, tstringer@quantmetry.com' +__version__ = '1.1.3' diff --git a/melusine/config/.gitignore b/melusine/config/.gitignore new file mode 100644 index 00000000..d2127d05 --- /dev/null +++ b/melusine/config/.gitignore @@ -0,0 +1 @@ +*.ini diff --git a/melusine/config/__init__.py b/melusine/config/__init__.py new file mode 100644 index 00000000..d05423af --- /dev/null +++ b/melusine/config/__init__.py @@ -0,0 +1 @@ +from .config import ConfigJsonReader diff --git a/melusine/config/conf.json b/melusine/config/conf.json new file mode 100644 index 00000000..5492307a --- /dev/null +++ b/melusine/config/conf.json @@ -0,0 +1,174 @@ +{ + "words_list": { + "keywords": ["vam", "auto", "voiture", "moto", "bonus", "transfert", "assurance", "assurances", + "pret", "immatriculation", "conducteur", "conduite", "raqvam", "maison", "appartement", + "habitation", "residence", "principale", "secondaire", "logement", "responsabilite", + "vente", "achat", "location", "attestation", "certificat", "cession", "adresse", "demenagement", + "assurance", "enfant", "profession", "voyage", "vacances", "etranger", "financement", + "protection", "juridique", "accident", "corporel", "assistance", "justificatif", "divorce", + "carte", "constat", "contrat", "bulletin", "automobile", "franchise", "mandat", "resiliation", + "devis", "resilier", "vehicule", "autorisation", "responsabilite", "naissance", "garantie", + "conducteur", "epargne", "cotisation", "souscription", "raqvam", "sinistres", "vehicules", + "auto", "deces", "vehicule", "contrat", "cloturer", "attestations", "indemnisation", + "proprietaire", "carte_verte", "carte_grise", "cartes_vertes", "cartes_grises", "sinistre", + "bulletin", "mensualisation", "rib", "cheque", "prelevement", "confidentialite", "rgpd", + "gdpr", "retraite", "questionnaire", "proprietaire"], + "stopwords": ["attached", "see", "file", "flagnsocpp_", "adobe", "reader", "flag_cons_", "flag_mail_", + "flag_url_", "flag_phone_", "flag_amount_", "flag_time_", "flag_immat_", + "flag_cp_", "flag_ndos_", "flag_date_", "tr", "soc", "flag_nsoc_PM", "flag_nsoc_PP" + , "pdf", "to", "cdlt", "cdt"], + "names": ["tom"] + }, + + + "regex": { + + "manage_transfer_reply" : { + + "begin_transfer" : "^[;\\s]*[-\\s]*Transféré par", + "begin_transfer_cons" : 
"De\\s*:\\s*[^<]*??\\s(?:[;A:]|(?:Envoyé))", + "transfer_other" : "^Le.*,.*a ecrit\\xa0:", + "extract_from" : "De\\s*:\\s*(.*?)\\s(?:[\nA:]|(?:Envoyé))", + "extract_to" : "[ÀA]\\s*:\\s*(.*?)\\s(?:[\n(?:Date)]|(?:Objet))", + "extract_date" : "(?:(?:Envoyé) ?|(?:Date))\\s*:\\s*(.*?)\\s(?:[\n(?:Objet)]|A)", + "extract_header" : "Objet\\s*:\\s*(.*?)\\s[;|\n]", + "answer_header": "RE\\s*:|re\\s*:|Re\\s*:", + "transfer_header": "Tr\\s*:|TR\\s*:|Fwd\\s*:|FW\\s*:|FWD\\s*:|Fw\\s*:" + + }, + + "build_historic" : { + + "transition_list" : [ + "[- ]*?Mail transféré.*?[;|\n]", + "[- ]*?gestionsocietaire@maif.fr a [ée]crit.*?[;|\n]", + "Courriel original.+?Objet\\s*:.+?[;|\n]", + "Transféré par.+?Objet\\s*:.+?[;|\n]", + "Message transmis.+?Objet\\s*:.+?[;|\n]", + "Message transféré.+?Objet\\s*:.+?[;|\n]", + "Message transféré.+?Pour\\s*:.+?[;|\n]", + "Début du message transféré.+?Objet\\s*:.+?[;|\n]", + "Début du message réexpédié.+?Objet\\s*:.+?[;|\n]", + "Début du message transféré.+?Destinataire\\s*:.+?[;|\n]", + "mail transféré.+?Objet\\s*:.+?[;|\n]", + "Forwarded message.+?To\\s*:.+?[;|\n]", + "Message d'origine.+?Objet\\s*:.+?[;|\n]", + "Mail original.+?Objet\\s*:.+?[;|\n]", + "Original Message.+?Subject\\s*:.+?[;|\n]", + "Message original.+?Objet\\s*:.+?[;|\n]", + "Expéditeur.+?Objet\\s*:.+?[;|\n]", + "(?:>?[;|\n]?\\s*(?:Envoyé|De|À|Objet|Cc|Envoyé par|Date|A|Destinataire|Sent|To|Subject|From|Copie à)+?\\s*:\\s*(?:.*?)\\s*[;|\n]\\s*)+", + "En date de.+?écrit", + ">?\\s*Le[^;\n]+?[;|\n]{0,1}[^;\n]+?a[^;\n]+?;{0,1}[^;\n]+?écrit\\s*:?", + ">?\\s*Message de.+?Objet\\s*:.+?[;|\n]", + ">?\\s*Message du.+?Objet\\s*:.+?[;|\n]", + "En date de.+?écrit" + ] + }, + + "cleaning" : { + + "flags_dict": { + + "\\w+ \\w+\/\\w+\/maif" : " flag_cons_ ", + "?" : " flag_mail_ ", + "(?:^| )(?:https?:\/\/)?(?:www\\.)?[-a-zA-Z0-9:%._\\+~#=]{2,256}\\.[a-zA-Z]{2,4}(?:[-a-zA-Z0-9:%_\\+.~#?&\/=]*) " : " flag_url_ ", + "(?:0|0033|\\+33 ?|.?33 ?|\\(0\\))+[1-9].??(?:\\d{2}[ |\\.|-]??){4}" : " flag_phone_ ", + " [A-Z]{2}[- ]?\\d{3}[- ]?[A-Z]{2}|\\d{3,4}[- ]?[A-Z]{2,3}[- ]?\\d{2,3} " : " flag_immat_ ", + "[\\d \\.\\,]+ ?(?:€|EUR|eur|dollar|dollars|\\$)(?:os)?" : " flag_amount_ ", + "(?:^| )\\d{5}(?! flag_amount_|\\d)" : " flag_cp_ ", + "[MF]{1} ??\\d{2} ??\\d{7}[A-Z]{1}" : " flag_ndos_ ", + "\\d{2}[:|H|h]\\d{2}(?::\\d{2})?" : " flag_time_ ", + "(?:\\d{1,4}[\/\\-\\.]){1,2}(?:\\d{2,4})" : " flag_date_ ", + "\\d{1,2} (?:janvier|fevrier|février|mars|avril|mai|juin|juillet|aout?|août?|septembre|octobre|novembre|decembre|décembre|janv?\\.?|fevr?\\.?|févr?\\.?|avr\\.?|juil\\.?|sept?\\.?|oct\\.?|nov\\.?|dec\\.?|déc\\.?) ?(?:\\d{2,4})?" 
: " flag_date_ " + + }, + + "clean_header_dict" : { + + "^re\\s*:" : "", + "^fw\\s*:" : "", + "^tr\\s*:" : "", + "^fwd\\s*:" : "" + + }, + + "remove_multiple_spaces_list" : ["\\t", "[\\s\\-\\*]{2,}"] + }, + + "mail_segmenting" : { + + "segmenting_dict" : { + + "RE/TR" : [], + + "DISCLAIMER" : ["^.{0,10}Ce message et toutes les pièces jointes sont confidentiels et établis à l'intention exclusive de son ou ses destinataires.*", + "^.{0,10}Si vous avez reçu ce message par erreur, merci d'en avertir immédiatement l'émetteur et de détruire le message.*", + "^.{0,10}Toute modification, édition, utilisation ou diffusion non autorisée est interdite.*", + "^.{0,10}L'émetteur décline toute responsabilité au titre de ce message s'il a été modifié, déformé, falsifié, infecté par un virus ou encore édité ou diffusé sans autorisation.", + "^.{0,10}This message and any attachments are confidential and intended for the named addressee.* only.*", + "^.{0,10}If you have received this message in error, please notify immediately the sender, then delete the message.*", + "^.{0,10}Any unauthorized modification, edition, use or dissemination is prohibited.*", + "^.{0,10}The sender shall not be liable for this message if it has been modified, altered, falsified, infected by a virus or even edited or disseminated without authorization.*" + ], + + "GREETINGS" : ["^.{0,30}cordialement.{0,30}$", + "^.{0,10}cdl?t.{0,16}$", + "^.{0,10}bien [àa] vous.{0,16}$", + "^.{0,30}sentiments mutualistes.{0,16}$", + "^.{0,60}salutations.{0,16}$", + "^.{0,30}expression de (?:nos|mes) sentiments.{0,16}$", + "^.{0,25}bon(?:ne)? (?:soir[ée]e|journ[ée]e|apr[èe]s-midi|courage).{0,16}(?!.)", + "^.{0,10}bonne r[ée]ception.{0,16}$", + "^.{0,30}[aà] votre disposition.{0,45}$", + "^.{0,30}dans l'attente .{0,30}$" + ], + + "PJ" : ["\\(See attached file:.*?\\)", + "\\(Embedded image.*?\\)" + ], + + "FOOTER" : ["(?:.{0,10}courrier électronique|.{0,30}virus|.{0,130}antivirus){2,}", + "Provenance : Courrier pour Windows", + "garanti sans virus.{0,30}", + ".{0,10}www.avg.com", + ".{0,10}www.avast.com", + "(?:Obtenez|Télécharge[zr]) Outlook pour .", + "^.{0,10}La visualisation des fichiers PDF nécessite Adobe Reader.*", + "^.{0,10}Si vous recevez ce message par erreur", + "^.{0,10}Retrouvez-nous sur www\\.maif-\\w+\\.fr", + "afin de contribuer au respect de l'environnement, merci de n'imprimer ce courriel qu'en cas de nécessité", + "^.{0,10}(?:Envoyé|Numérisé|Partagé) de(?:puis)?\\s*(?:mon)?\\s*(?:mobile|smartphone|appareil|iP.|Galaxy|Yahoo|(?:l\\'application))" + ], + + "HELLO" : ["^.{0,16}(?:(?:bonjour)|(?:bonsoir)|(?:madame)|(?:monsieur)|(?:mesdames)|(?:messieurs)).{0,20}(?!.)"], + + "THANKS" : ["^.{0,30}merci.{0,30}(?!.)"] + }, + + "meta_from1" : "De\\s*:\\s*(.*?)\\s\n", + "meta_from2" : "^>?\\s*Le.*,(.*)a écrit", + "meta_to" : "A\\s*:\\s*(.*?)\\s\n", + "meta_date1" : "Date\\s*:\\s*(.*?)\\s\n", + "meta_date2" : "^>?\\s*Le(.*),.*a écrit", + "meta_header" : "Objet\\s*:\\s*(.*?)\\s\n", + "pattern_pj" : "(\\(See attached file:.*?\\)|.\\(Embedded image.*?\\))", + "pattern_separteurs_evidents" : "[?!]|\\.{3}|$|", + "pattern_sep_doubles_points_virgules_espace" : "(?:\\;\\s*){2,}|", + "pattern_exception_une_lettre_maj" : "(?>> conf = ConfigJsonReader() + >>> conf.path_ini_file_ # will return path to the created .ini file + + >>> # I defined my own path otherwise it uses a default conf.json + >>> conf.set_config_path(file_path='path/to/my/conf.json') + >>> conf_dict = conf.get_config_file() + >>> print(conf_dict) # will print the json. 
+ + """ + + def __init__(self): + config_directory = op.dirname(op.abspath(__file__)) + self.path_ini_file_ = op.join(config_directory, 'path.ini') + self.path_default_conf_json_ = op.join(config_directory, 'conf.json') + + if not op.exists(self.path_ini_file_): + logging.info("Create an path.ini file to configure your own config.json") + ini_file = open(self.path_ini_file_, 'w') + conf = ConfigParser() + conf.add_section('PATH') + conf.set('PATH', 'template_config', self.path_default_conf_json_) + conf.write(ini_file) + ini_file.close() + + self.config = ConfigParser() + self.config.read(self.path_ini_file_) + + pass + + def set_config_path(self, file_path=None): + """Set a path for your own `config.json`. + + Parameters + ---------- + file_path: str, optional + Path to the json file. If set to None (default), it will use the default + json located in the built-in package `melusine/config/conf.json`. + """ + if file_path is not None: + # if file_path is specified, it writes new path in the .ini file. + self.config['PATH']['template_config'] = file_path + with open(self.path_ini_file_, 'w') as configfile: + self.config.write(configfile) + pass + + def get_config_file(self): + """Load a config json file from the given path.""" + path = self.config['PATH']['template_config'] + + if path == self.path_default_conf_json_: + config_file = self.load_config_file(path=None) + else: + config_file = self.load_config_file(path=path) + + return config_file + + def reset_config_path(self): + self.config['PATH']['template_config'] = self.path_default_conf_json_ + + pass + + def load_config_file(self, path=None): + """Load Json.""" + # by default it takes native the config.json + if path is None: + path = self.path_default_conf_json_ + + with open(file=path, mode='r') as file: + return json.load(file) + + logging.info("Load config from path: {}.".format(path)) diff --git a/melusine/models/__init__.py b/melusine/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/melusine/models/neural_architectures.py b/melusine/models/neural_architectures.py new file mode 100644 index 00000000..2a2c5191 --- /dev/null +++ b/melusine/models/neural_architectures.py @@ -0,0 +1,193 @@ +import keras +from keras.layers import BatchNormalization +from keras.layers import LeakyReLU +from keras.layers import Input +from keras.layers import concatenate +from keras.optimizers import Adam +from keras.models import Model +from keras.layers import Dense +from keras.layers import Conv1D +from keras.layers import Dropout +from keras.layers import SpatialDropout1D +from keras.layers import GlobalMaxPooling1D +from keras.layers import GRU +from keras.layers import Bidirectional + + +def cnn_model(embedding_matrix_init, + ntargets=18, + seq_max=100, + nb_meta=252, + loss='categorical_crossentropy'): + """Pre-defined architecture of a CNN model. + + Parameters + ---------- + embedding_matrix_init : np.array, + Pretrained embedding matrix. + + ntargets : int, optional + Dimension of model output. + Default value, 18. + + seq_max : int, optional + Maximum input length. + Default value, 100. + + nb_meta : int, optional + Dimension of meta data input. + Default value, 252. + + loss : str, optional + Loss function for training. + Default value, 'categorical_crossentropy'. 
+ + Returns + ------- + Model instance + """ + + text_input = Input(shape=(seq_max,), dtype='int32') + + x = keras.layers.Embedding(input_dim=embedding_matrix_init.shape[0], + output_dim=embedding_matrix_init.shape[1], + input_length=seq_max, + weights=[embedding_matrix_init], + trainable=True)(text_input) + x = Conv1D(200, 2, padding='same', activation='linear', strides=1)(x) + x = SpatialDropout1D(0.15)(x) + x = BatchNormalization()(x) + x = LeakyReLU(alpha=0.05)(x) + x = Conv1D(250, 2, padding='same', activation='linear', strides=1)(x) + x = SpatialDropout1D(0.15)(x) + x = LeakyReLU(alpha=0.05)(x) + x = Dropout(0.15)(x) + x = GlobalMaxPooling1D()(x) + x = Dense(250, activation="linear")(x) + x = LeakyReLU(alpha=0.05)(x) + x = Dense(150, activation="linear")(x) + x = Dropout(0.15)(x) + x = LeakyReLU(alpha=0.05)(x) + + if nb_meta == 0: + inputs = text_input + concatenate_2 = x + else: + Meta_input = Input(shape=(nb_meta,), dtype='float32') + inputs = [text_input, Meta_input] + + concatenate_1 = Meta_input + y = Dense(150, activation="linear")(concatenate_1) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + y = Dense(100, activation="linear")(y) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + y = Dense(80, activation="linear")(y) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + concatenate_2 = concatenate([x, y]) + + z = Dense(200, activation="linear")(concatenate_2) + z = Dropout(0.2)(z) + z = LeakyReLU(alpha=0.05)(z) + z = Dense(100, activation="linear")(z) + z = Dropout(0.2)(z) + z = LeakyReLU(alpha=0.05)(z) + outputs = Dense(ntargets, activation='softmax')(z) + + model = Model(inputs=inputs, + outputs=outputs) + + model.compile(optimizer=Adam(), + loss=loss, + metrics=['accuracy']) + + return model + + +def rnn_model(embedding_matrix_init, + ntargets=18, + seq_max=100, + nb_meta=252, + loss='categorical_crossentropy'): + """Pre-defined architecture of a RNN model. + + Parameters + ---------- + embedding_matrix_init : np.array, + Pretrained embedding matrix. + + ntargets : int, optional + Dimension of model output. + Default value, 18. + + seq_max : int, optional + Maximum input length. + Default value, 100. + + nb_meta : int, optional + Dimension of meta data input. + Default value, 252. + + loss : str, optional + Loss function for training. + Default value, 'categorical_crossentropy'. 
+ + Returns + ------- + Model instance + """ + text_input = Input(shape=(seq_max,), dtype='int32') + + x = keras.layers.Embedding(input_dim=embedding_matrix_init.shape[0], + output_dim=embedding_matrix_init.shape[1], + input_length=seq_max, + weights=[embedding_matrix_init], + trainable=True)(text_input) + x = Bidirectional(GRU(80, return_sequences=True))(x) + x = SpatialDropout1D(0.15)(x) + x = Bidirectional(GRU(40, return_sequences=True))(x) + x = SpatialDropout1D(0.15)(x) + x = GlobalMaxPooling1D()(x) + x = Dense(250, activation="linear")(x) + x = LeakyReLU(alpha=0.05)(x) + x = Dense(150, activation="linear")(x) + x = Dropout(0.15)(x) + x = LeakyReLU(alpha=0.05)(x) + + if nb_meta == 0: + inputs = text_input + concatenate_2 = x + else: + Meta_input = Input(shape=(nb_meta,), dtype='float32') + inputs = [text_input, Meta_input] + + concatenate_1 = Meta_input + y = Dense(150, activation="linear")(concatenate_1) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + y = Dense(100, activation="linear")(y) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + y = Dense(80, activation="linear")(y) + y = Dropout(0.2)(y) + y = LeakyReLU(alpha=0.05)(y) + concatenate_2 = concatenate([x, y]) + + z = Dense(200, activation="linear")(concatenate_2) + z = Dropout(0.2)(z) + z = LeakyReLU(alpha=0.05)(z) + z = Dense(100, activation="linear")(z) + z = Dropout(0.2)(z) + z = LeakyReLU(alpha=0.05)(z) + output = Dense(ntargets, activation='softmax')(z) + + model = Model(inputs=inputs, + outputs=output) + + model.compile(optimizer=Adam(), + loss=loss, + metrics=['accuracy']) + + return model diff --git a/melusine/models/train.py b/melusine/models/train.py new file mode 100644 index 00000000..c4859fc6 --- /dev/null +++ b/melusine/models/train.py @@ -0,0 +1,265 @@ +import numpy as np +from collections import Counter +from sklearn.base import BaseEstimator, ClassifierMixin +from keras.utils import np_utils +from keras.models import model_from_json +from keras.preprocessing.text import Tokenizer +from keras.preprocessing.sequence import pad_sequences + + +class NeuralModel(BaseEstimator, ClassifierMixin): + """Generic class for neural models. + + It is compatible with scikit-learn API (i.e. contains fit, transform + methods). + + Parameters + ---------- + neural_architecture_function : function, + Function which returns a Model instance from Keras. + + pretrained_embedding : np.array, + Pretrained embedding matrix. + + text_input_column : str, + Input text column to consider for the model. + + meta_input_list : list, optional + List of the names of the columns containing the metadata. + If empty list or None the model is used without metadata + Default value, ['extension', 'dayofweek', 'hour', 'min']. + + vocab_size : int, optional + Size of vocabulary for neurol network model. + Default value, 25000. + + seq_size : int, optional + Maximum size of input for neural model. + Default value, 100. + + loss : str, optional + Loss function for training. + Default value, 'categorical_crossentropy'. + + batch_size : int, optional + Size of batches for the training of the neural network model. + Default value, 4096. + + n_epochs : int, optional + Number of epochs for the training of the neural network model. + Default value, 15. 
+ + Attributes + ---------- + architecture_function, pretrained_embedding, text_input_column, + meta_input_list, vocab_size, seq_size, loss, batch_size, n_epochs, + + model : Model instance from Keras, + + tokenizer : Tokenizer instance from Keras, + + embedding_matrix : np.array, + Embedding matrix used as input for the neural network model. + + Examples + -------- + >>> from melusine.models.train import NeuralModel + >>> from melusine.models.neural_architectures import cnn_model + >>> from melusine.nlp_tools.embedding import Embedding + >>> pretrained_embedding = Embedding.load() + >>> list_meta = ['extension', 'dayofweek', 'hour'] + >>> nn_model = NeuralModel(cnn_model, pretrained_embedding, list_meta) + >>> nn_model.fit(X_train, y_train) + >>> y_res = nn_model.predict(X_test) + + """ + + def __init__(self, + neural_architecture_function, + pretrained_embedding, + text_input_column='clean_text', + meta_input_list=['extension', 'dayofweek', 'hour', 'min'], + vocab_size=25000, + seq_size=100, + loss='categorical_crossentropy', + batch_size=4096, + n_epochs=15, + **kwargs): + self.architecture_function = neural_architecture_function + self.pretrained_embedding = pretrained_embedding + self.text_input_column = text_input_column + self.meta_input_list = meta_input_list + self.vocab_size = vocab_size + self.seq_size = seq_size + self.loss = loss + self.batch_size = batch_size + self.n_epochs = n_epochs + + def save_nn_model(self, filepath): + """Save model to pickle, json and save weights to .h5.""" + json_model = self.model.to_json() + open(filepath+".json", 'w').write(json_model) + self.model.save_weights(filepath+"_model_weights.h5", overwrite=True) + pass + + def load_nn_model(self, filepath): + """Save model from json and load weights from .h5.""" + model = model_from_json(open(filepath+".json").read()) + model.load_weights(filepath+"_model_weights.h5") + + return model + + def __getstate__(self, filepath): + """Method called before serialization for a specific treatment to save + model weight and structure instead of standard serialization.""" + dict_attr = dict(self.__dict__) + if 'model' in dict_attr: + self.save_nn_model(filepath) + del dict_attr["model"] + return dict_attr + + def __setstate__(self, dict_attr, filepath): + """Method called before loading class for a specific treatment to load + model weight and structure instead of standard serialization.""" + self.__dict__ = dict_attr + self.model = self.load_nn_model(filepath) + + def fit(self, X, y, **kwargs): + """Fit the neural network model on X and y. + If meta_input list is empty list or None the model is used + without metadata. + + Compatible with scikit-learn API. + + Parameters + ---------- + X : pd.DataFrame + + y : pd.Series + + Returns + ------- + self : object + Returns the instance + """ + self._fit_tokenizer(X) + self._create_word_indexes_from_tokens() + self._get_embedding_matrix() + + X_seq = self._prepare_sequences(X) + X_meta, nb_meta_features = self._get_meta(X) + y_categorical = np_utils.to_categorical(y) + nb_labels = len(np.unique(y)) + + self.model = self.architecture_function( + embedding_matrix_init=self.embedding_matrix, + ntargets=nb_labels, + seq_max=self.seq_size, + nb_meta=nb_meta_features, + loss=self.loss) + + if nb_meta_features == 0: + X_input = X_seq + else: + X_input = [X_seq, X_meta] + + self.model.fit(X_input, + y_categorical, + batch_size=self.batch_size, + epochs=self.n_epochs, + **kwargs) + pass + + def predict(self, X, **kwargs): + """Returns the class predicted. 
+ + Parameters + ---------- + X : pd.DataFrame + + Returns + ------- + int + """ + return np.argmax(self.predict_proba(X, **kwargs), axis=1) + + def predict_proba(self, X, **kwargs): + """Returns the probabilities associated to each classes. + If meta_input list is empty list or None the model is used + without metadata. + + Parameters + ---------- + X : pd.DataFrame + + Returns + ------- + np.array + """ + X_seq = self._prepare_sequences(X) + X_meta, nb_meta_features = self._get_meta(X) + if nb_meta_features == 0: + X_input = X_seq + else: + X_input = [X_seq, X_meta] + return self.model.predict(X_input, **kwargs) + + def _fit_tokenizer(self, X): + """Fit a Tokenizer instance from Keras on a clean body.""" + self.tokenizer = Tokenizer(num_words=self.vocab_size, + oov_token='UNK') + self.tokenizer.fit_on_texts(X[self.text_input_column]) + pass + + def _create_word_indexes_from_tokens(self): + """Create a word indexes dictionary from tokens.""" + c = Counter(self.tokenizer.word_counts) + self.tokenizer.word_index = {t[0]: i + 1 for i, t + in enumerate(c.most_common(len(c)))} + self.tokenizer.word_index['UNK'] = 0 + pass + + def _get_embedding_matrix(self): + """Prepares the embedding matrix to be used as an input for + the neural network model.""" + pretrained_embedding = self.pretrained_embedding + vocab_size = self.vocab_size + vector_dim = pretrained_embedding.embedding.vector_size + wv_dict = {word: vec for word, vec in + zip(pretrained_embedding.embedding.wv.index2word, + pretrained_embedding.embedding.wv.syn0)} + embedding_matrix = np.zeros((vocab_size+1, vector_dim)) + for word, index in self.tokenizer.word_index.items(): + if index >= vocab_size: + continue + embedding_vector = wv_dict.get(word) + if embedding_vector is not None: + embedding_matrix[index] = embedding_vector + self.embedding_matrix = embedding_matrix + pass + + def _prepare_sequences(self, X): + """Prepares the sequence to be used as input for the neural network + model.""" + seqs = self.tokenizer.texts_to_sequences(X[self.text_input_column]) + X_seq = pad_sequences(seqs, maxlen=self.seq_size) + + return X_seq + + def _get_meta(self, X): + """Returns as a pd.DataFrame the metadata from X given the list_meta + defined, and returns the number of columns. If meta_input_list is + empty list or None, meta_input_list is returned as 0.""" + if self.meta_input_list is None or self.meta_input_list == []: + X_meta = None + nb_meta_features = 0 + else: + meta_input_list = self.meta_input_list + meta_input_list = [col+'__' for col in meta_input_list] + columns_list = list(X.columns) + meta_columns_list = [col for col in columns_list + if col.startswith(tuple(meta_input_list))] + X_meta = X[meta_columns_list] + nb_meta_features = len(meta_columns_list) + + return X_meta, nb_meta_features diff --git a/melusine/nlp_tools/__init__.py b/melusine/nlp_tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/melusine/nlp_tools/embedding.py b/melusine/nlp_tools/embedding.py new file mode 100644 index 00000000..af902fbb --- /dev/null +++ b/melusine/nlp_tools/embedding.py @@ -0,0 +1,89 @@ +import logging +from gensim.models import Word2Vec +from melusine.utils.streamer import Streamer + +log = logging.getLogger('Embeddings') +log.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%d/%m %I:%M') + + +class Embedding(): + """Class to train embeddings with Word2Vec algorithm. 
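+
+    Parameters
+    ----------
+    input_column : str, optional
+        Input text column to consider for the embedding.
+        Default value, 'clean_text'.
+
+    workers : int, optional
+        Number of worker threads passed to the Gensim Word2Vec model.
+        Default value, 40.
+
+    seed : int, optional
+        Random seed passed to the Gensim Word2Vec model.
+        Default value, 42.
+
+    iter : int, optional
+        Number of training iterations (epochs) over the corpus.
+        Default value, 15.
+
+    size : int, optional
+        Dimensionality of the word vectors.
+        Default value, 300.
+
+    window : int, optional
+        Maximum distance between the current and predicted word within a sentence.
+        Default value, 5.
+
+    min_count : int, optional
+        Minimum word frequency for a word to be taken into account.
+        Default value, 100.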
+ + Attributes + ---------- + input_column : str, + Input text column to consider for the embedding. + + stream : Streamer instance, + Builds a stream a tokens from a pd.Dataframe to train the embeddings. + + embedding : Word2Vec instance from Gensim + + Examples + -------- + >>> from melusine.nlp_tools.embedding import Embedding + >>> embedding = Embedding() + >>> embedding.train(X) + >>> embedding.save(filepath) + >>> embedding = Embedding().load(filepath) + + """ + + def __init__(self, + input_column='clean_text', + workers=40, + seed=42, + iter=15, + size=300, + window=5, + min_count=100): + self.logger = logging.getLogger('NLUtils.Embedding') + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + self.logger.addHandler(ch) + self.logger.debug('Create an Embedding instance.') + self.input_column = input_column + self.streamer = Streamer(columns=self.input_column) + self.workers = workers + self.seed = seed + self.iter = iter + self.size = size + self.window = window + self.min_count = min_count + + def save(self, filepath): + """Method to save Embedding object.""" + self.embedding.save(filepath) + + def load(self, filepath): + """Method to load Embedding object.""" + self.embedding = Word2Vec.load(filepath) + return self + + def train(self, X): + """Train embeddings with Word2Vec algorithm. + + Parameters + ---------- + X : pd.Dataframe + Containing a clean body column. + + Returns + ------- + self : object + Returns the instance + """ + self.logger.info('Start training for embedding') + self.streamer.to_stream(X) + self.embedding = Word2Vec(self.streamer.stream, + workers=self.workers, + seed=self.seed, + iter=self.iter, + size=self.size, + window=self.window, + min_count=self.min_count) + self.logger.info('Done.') + pass diff --git a/melusine/nlp_tools/phraser.py b/melusine/nlp_tools/phraser.py new file mode 100644 index 00000000..4137645e --- /dev/null +++ b/melusine/nlp_tools/phraser.py @@ -0,0 +1,255 @@ +import sys +import logging +import gensim +import nltk +import pickle +import re +from melusine.utils.streamer import Streamer + + +regex_tokenize_with_punctuations = r"(.*?[\s'])" +tokenize_without_punctuations = r"(.*?)[\s']" +regex_process = "\w+(?:[\?\-\'\"_]\w+)*" +regex_split_parts = r"(.*?[;.,?!])" + +common_terms = nltk.corpus.stopwords.words("french") + +log = logging.getLogger('nlp_tools') +log.setLevel(logging.DEBUG) +formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s \ + - %(message)s', datefmt='%d/%m %I:%M') + + +def phraser_on_body(row, phraser): + """Applies phraser on cleaned body. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. + + Parameters + ---------- + row : row of pd.Dataframe + + phraser : Phraser instance, + + Returns + ------- + pd.Series + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> from melusine.nlp_tools.phraser import phraser_on_body + >>> from melusine.nlp_tools.phraser import Phraser + >>> # data contains a 'clean_body' column + >>> phraser = Phraser(columns='clean_body').load(filepath) + >>> data.apply(phraser_on_body, axis=1) # apply to all samples + + """ + clean_body = phraser_on_text(row["clean_body"], phraser) + + return clean_body + + +def phraser_on_header(row, phraser): + """Applies phraser on cleaned header. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. 
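# Illustrative sketch (editorial, not part of the patch): training, saving and
# reloading an Embedding on a DataFrame with a 'clean_text' column; the file
# path and the lowered size/min_count values are assumptions for a toy corpus.
import pandas as pd
from melusine.nlp_tools.embedding import Embedding

X = pd.DataFrame({'clean_text': ['bonjour je souhaite resilier mon contrat',
                                 'bonjour merci pour votre reponse']})
embedding = Embedding(input_column='clean_text', size=50, min_count=1, workers=1)
embedding.train(X)
embedding.save('./embedding.pickle')
embedding = Embedding().load('./embedding.pickle')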
+ + Parameters + ---------- + row : row of pd.Dataframe + + phraser : Phraser instance, + + Returns + ------- + pd.Series + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> from melusine.nlp_tools.phraser import phraser_on_header + >>> from melusine.nlp_tools.phraser import Phraser + >>> # data contains a 'clean_header' column + >>> phraser = Phraser(columns='clean_header').load(filepath) + >>> data.apply(phraser_on_header, axis=1) # apply to all samples + + """ + clean_header = phraser_on_text(row["clean_header"], phraser) + + return clean_header + + +def phraser_on_text(text, phraser): + """Returns text with phrased words. + + Parameters + ---------- + text : str, + + phraser : Phraser instance, + + Returns + ------- + str + + """ + if not re.search(pattern=r"\W*\b\w+\b\W*", string=text): + return text + pre_typos_list, words_list, separators_list = _split_typos_words_separators(text) + phrased_words_list = phraser.phraser[words_list] + phrased_text = _rebuild_phrased_text_with_punctuation(pre_typos_list, + words_list, + separators_list, + phrased_words_list) + + return phrased_text + + +def _rebuild_phrased_text_with_punctuation(pre_typos_list, + words_list, + separators_list, + phrased_words_list): + """Rebuilds the initial text with phrased words.""" + i = 0 + for pre_typo, word, separator in zip(pre_typos_list, + words_list, + separators_list): + phrased_word = re.sub("\W", "", phrased_words_list[i]) + word = re.sub("\W", "", word) + if len(phrased_word) > len(word): + if _check_last_word_phrased(phrased_word, word): + phrased_words_list[i] = pre_typo + phrased_word + separator + i += 1 + else: + phrased_words_list[i] = pre_typo + phrased_word + separator + i += 1 + + return "".join(phrased_words_list) + + +def _check_last_word_phrased(phrased_word, word): + """Check if a word is the last word of a phrased word.""" + words_list = phrased_word.split("_") + last_word = words_list[-1] + + return word == last_word + + +def _split_typos_words_separators(text, pattern=r"(\W*)\b(\w+)\b(\W*)"): + """Split text according to typos.""" + tuple_word_separator_list = re.findall(pattern, text, flags=re.M | re.I) + pre_typos_list, words_list, separators_list = zip(*tuple_word_separator_list) + + return pre_typos_list, words_list, separators_list + + +class Phraser(): + """Class to train a phraser. + + Parameters + ---------- + input_column : str, + Input text column to consider for the phraser. + + common_terms : list of integers, optional + List of stopwords. + Default value, list of stopwords from nltk. + + threshold : int, optional + Threshold to select colocations. + Default value, 350. + + min_count : int, optional + Minimum count of word to be selected as colocation. + Default value, 200. + + Attributes + ---------- + common_terms, threshold, min_count, + + stream : Streamer object, + Builds a stream a tokens from a pd.Dataframe to train the embeddings. 
+ + phraser : Phraser object from Gensim + + Examples + -------- + >>> from melusine.nlp_tools.phraser import Phraser + >>> phraser = Phraser() + >>> phraser.train(X) + >>> phraser.save(filepath) + >>> phraser = phraser().load(filepath) + + """ + + def __init__(self, + input_column='clean_body', + common_terms=common_terms, + threshold=350, + min_count=200): + self.logger = logging.getLogger('NLUtils.Phraser') + self.logger.debug('creating a Phraser instance') + self.common_terms = common_terms + self.threshold = threshold + self.min_count = min_count + self.input_column = input_column + self.streamer = Streamer(columns=self.input_column) + ch = logging.StreamHandler(sys.stdout) + ch.setLevel(logging.INFO) + ch.setFormatter(formatter) + self.logger.addHandler(ch) + + def __getstate__(self): + """should return a dict of attributes that will be pickled + To override the default pickling behavior and + avoid the pickling of the logger + """ + d = self.__dict__.copy() + if 'logger' in d: + d['logger'] = d['logger'].name + return d + + def __setstate__(self, d): + """To override the default pickling behavior and + avoid the pickling of the logger""" + if 'logger' in d: + d['logger'] = logging.getLogger(d['logger']) + self.__dict__.update(d) + + def save(self, filepath): + """Method to save Phraser object""" + with open(filepath, 'wb') as f: + pickle.dump(self.phraser, f) + + def load(self, filepath): + """Method to load Phraser object""" + with open(filepath, 'rb') as f: + self.phraser = pickle.load(f) + return self + + def train(self, X): + """Train phraser. + + Parameters + ---------- + X : pd.Dataframe + + Returns + ------- + self : object + Returns the instance + """ + self.logger.info('Start training for colocation detector') + self.streamer.to_stream(X) + phrases = gensim.models.Phrases(self.streamer.stream, + common_terms=self.common_terms, + threshold=self.threshold, + min_count=self.min_count) + self.phraser = gensim.models.phrases.Phraser(phrases) + self.logger.info('Done.') + pass diff --git a/melusine/nlp_tools/tokenizer.py b/melusine/nlp_tools/tokenizer.py new file mode 100644 index 00000000..a4ff40aa --- /dev/null +++ b/melusine/nlp_tools/tokenizer.py @@ -0,0 +1,133 @@ +import logging +import nltk +from sklearn.base import BaseEstimator, TransformerMixin +from melusine.config.config import ConfigJsonReader + +conf_reader = ConfigJsonReader() +config = conf_reader.get_config_file() + +newStopWords = config["words_list"]["stopwords"] +stopwords = nltk.corpus.stopwords.words('french') +stopwords.extend(newStopWords) + +regex_tokenize = "\w+(?:[\?\-\"_]\w+)*" + + +class Tokenizer(BaseEstimator, TransformerMixin): + """Class to train and apply tokenizer. + + Compatible with scikit-learn API (i.e. contains fit, transform methods). + + Parameters + ---------- + input_column : str, + Input text column to consider for the tokenizer. + + stopwords : list of strings, optional + List of words to remove from list of tokens. + Default value, list defined in conf file + + stop_removal : boolean, optional + True if stopwords to be removed, else False. + Default value, False. + + n_jobs : int, optional + Number of cores used for computation. + Default value, 20. 
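# Illustrative sketch (editorial, not part of the patch): training a Phraser on
# a 'clean_body' column and applying it row by row; the lowered threshold and
# min_count values and the file path are assumptions for a toy corpus.
import pandas as pd
from melusine.nlp_tools.phraser import Phraser, phraser_on_body

X = pd.DataFrame({'clean_body': ['je souhaite resilier mon contrat habitation',
                                 'mon contrat habitation arrive a echeance']})
phraser = Phraser(input_column='clean_body', threshold=5, min_count=1)
phraser.train(X)
phraser.save('./phraser.pickle')       # hypothetical path
phraser = Phraser().load('./phraser.pickle')
X['clean_body'] = X.apply(phraser_on_body, args=(phraser,), axis=1)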
+ + Attributes + ---------- + stopwords, stop_removal, n_jobs + + Examples + -------- + >>> from melusine.nlp_tools.tokenizer import Tokenizer + >>> tokenizer = Tokenizer() + >>> X = tokenizer.fit_transform(X) + >>> tokenizer.save(filepath) + >>> tokenizer = Tokenizer().load(filepath) + + """ + + def __init__(self, + input_column='clean_text', + stopwords=stopwords, + stop_removal=True, + n_jobs=20): + self.input_column = input_column + self.stopwords = stopwords + self.stop_removal = stop_removal + self.n_jobs = n_jobs + self.logger = logging.getLogger('emails_application.preprocessing.Preprocessing') + self.logger.debug('creating an instance of Preprocessing') + + def __getstate__(self): + """should return a dict of attributes that will be pickled + To override the default pickling behavior and + avoid the pickling of the logger + """ + d = self.__dict__.copy() + d['n_jobs'] = 1 + if 'logger' in d: + d['logger'] = d['logger'].name + return d + + def __setstate__(self, d): + """To override the default pickling behavior and + avoid the pickling of the logger""" + if 'logger' in d: + d['logger'] = logging.getLogger(d['logger']) + self.__dict__.update(d) + + def fit(self, X, y=None): + """Unused method. Defined only for compatibility with scikit-learn API. + """ + return self + + def transform(self, X): + """Applies tokenize method on pd.Dataframe. + + Parameters + ---------- + X : pandas.DataFrame, + Data on which transformations are applied. + + Returns + ------- + pandas.DataFrame + """ + self.logger.debug('Start transform tokenizing') + X['tokens'] = X[[self.input_column]].apply(self.tokenize, axis=1) + X['tokens'] = X['tokens'].apply(lambda x: x[0]) + self.logger.info('X shape : %s' % str(X.shape)) + self.logger.debug('Done.') + return X + + def tokenize(self, row): + """Returns list of tokens. + + Parameters + ---------- + row : row of pd.Dataframe + + Returns + ------- + pd.Series + + """ + text = row[self.input_column] + tokens = self._tokenize(text) + tokens = self._remove_stopwords(tokens) + return [tokens] + + def _tokenize(self, text, pattern=regex_tokenize): + """Returns list of tokens from text.""" + return nltk.tokenize.regexp_tokenize(str(text), pattern=pattern) + + def _remove_stopwords(self, list): + """ Removes stopwords from list if stop_removal parameter + set to True.""" + if self.stop_removal: + return [x for x in list if x not in self.stopwords] + else: + return list diff --git a/melusine/prepare_email/__init__.py b/melusine/prepare_email/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/melusine/prepare_email/body_header_extraction.py b/melusine/prepare_email/body_header_extraction.py new file mode 100644 index 00000000..8bdd647f --- /dev/null +++ b/melusine/prepare_email/body_header_extraction.py @@ -0,0 +1,62 @@ +def extract_last_body(row): + """Extracts the body from the last message of the conversation. + The conversation is structured as a dictionary. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. + + Parameters + ---------- + message_dict : dict + + Returns + ------- + str + + """ + last_message_dict = row["structured_body"][0] + last_body = extract_body(last_message_dict) + + return last_body + + +def extract_body(message_dict): + """Extracts the body from a message dictionary. 
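# Illustrative sketch (editorial, not part of the patch): tokenizing a
# 'clean_text' column; stopword removal depends on the lists declared in
# conf.json and on the nltk French stopwords.
import pandas as pd
from melusine.nlp_tools.tokenizer import Tokenizer

X = pd.DataFrame({'clean_text': ['bonjour je souhaite resilier mon contrat']})
tokenizer = Tokenizer(input_column='clean_text', stop_removal=True, n_jobs=1)
X = tokenizer.fit_transform(X)
# X['tokens'] -> something like ['souhaite', 'resilier', 'contrat']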
+ + Parameters + ---------- + message_dict : dict + + Returns + ------- + str + + """ + tagged_parts_list = message_dict["structured_text"]["text"] + body = "" + for part_tag_dict in tagged_parts_list: + part = part_tag_dict["part"] + tag = part_tag_dict["tags"] + if tag == 'BODY': + body += part + " " + elif tag == 'GREETINGS': + break + + return body + + +def extract_header(message_dict): + """Extracts the header from a message dictionary. + + Parameters + ---------- + message_dict : dict + + Returns + ------- + str + + """ + header = message_dict["structured_text"]["header"] + + return header diff --git a/melusine/prepare_email/build_historic.py b/melusine/prepare_email/build_historic.py new file mode 100644 index 00000000..cff0d95b --- /dev/null +++ b/melusine/prepare_email/build_historic.py @@ -0,0 +1,88 @@ +import re +from melusine.config import ConfigJsonReader + +conf_reader = ConfigJsonReader() +config = conf_reader.get_config_file() +regex_transition_list = config['regex']['build_historic']['transition_list'] + + +def build_historic(row): + """Rebuilds and structures historic of emails from the whole contents. + Function has to be applied with `apply` method of a DataFrame along an + axis=1. + For each email of the historic, it segments the body into 2 different parts + (2 keys of dict): + + {'text': extract raw text without metadata, + 'meta': get transition from the 'transition_list' defined in the conf.json + }. + + + Parameters + ---------- + row : row, + A pandas.DataFrame row object with 'body' column. + + Returns + ------- + list + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> # data contains a 'body' column + + >>> from melusine.prepare_email.build_historic import build_historic + >>> build_historic(data.iloc[0]) # apply for 1 sample + >>> data.apply(build_historic, axis=1) # apply to all samples + + """ + email_body = row['body'] + index_messages, nb_messages = _get_index_transitions(email_body) + structured_historic = [ + {'text': email_body[index_messages[i][1]:index_messages[i+1][0]], + 'meta': email_body[index_messages[i][0]:index_messages[i][1]] + } for i in range(nb_messages)] + + return structured_historic + + +def _get_index_transitions(email_body): + """Returns list of indexes defining the transitions between + different messages in an email.""" + index = [] + for regex in regex_transition_list: + for match in re.finditer(regex, email_body): + idx = (match.start(), match.end()) + index.append(idx) + + index = [(0, 0)] + index + index = index + [(len(email_body), len(email_body))] + index = list(set(index)) + index = sorted(index, key=lambda tup: tup[0]) + index = __filter_overlap(index) + nb_parts = len(index) - 1 + + return index, nb_parts + + +def __filter_overlap(index): + """Filters indexes in list if they overlap.""" + if len(index) == 2: + return index + index_f = [] + i = 0 + j = i + 1 + while j < len(index): + if index[i][1] > index[j][0]: + index[i] = (min(index[i][0], index[j][1]), + max(index[i][0], index[j][1])) + j += 1 + else: + index_f += [index[i]] + i = j + j += 1 + index_f += [index[i]] + + return index_f[:i+1] diff --git a/melusine/prepare_email/cleaning.py b/melusine/prepare_email/cleaning.py new file mode 100644 index 00000000..f758d526 --- /dev/null +++ b/melusine/prepare_email/cleaning.py @@ -0,0 +1,183 @@ +""" +Cleaning of the body and the header +""" + +import unidecode +import re +from melusine.config import ConfigJsonReader + +conf_reader = 
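# Editorial example (not part of the patch) of extract_body / extract_header on
# a hand-built message dict following the structure produced by mail_segmenting.
from melusine.prepare_email.body_header_extraction import extract_body, extract_header

message_dict = {'meta': {'date': None, 'from': None, 'to': None},
                'structured_text': {'header': 'demande de resiliation',
                                    'text': [{'part': 'Bonjour,', 'tags': 'HELLO'},
                                             {'part': 'je souhaite resilier mon contrat.',
                                              'tags': 'BODY'},
                                             {'part': 'Cordialement,', 'tags': 'GREETINGS'}]}}
extract_header(message_dict)   # -> 'demande de resiliation'
extract_body(message_dict)     # -> 'je souhaite resilier mon contrat. ' (BODY parts until GREETINGS)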
ConfigJsonReader() +config = conf_reader.get_config_file() +REGEX_CLEAN = config["regex"]['cleaning'] +regex_flags_dict = REGEX_CLEAN["flags_dict"] +regex_clean_header_dict = REGEX_CLEAN["clean_header_dict"] +regex_remove_multiple_spaces_list = REGEX_CLEAN["remove_multiple_spaces_list"] + + +def clean_body(row, flags=True): + """Clean body column. The cleaning involves the following operations: + - Cleaning the text + - Removing the multiple spaces + - Flagging specific items (postal code, phone number, date...) + + Parameters + ---------- + row : row of pandas.Dataframe object, + Data contains 'last_body' column. + + flags : boolean, optional + True if you want to flag relevant info, False if not. + Default value, True. + + Returns + ------- + row of pandas.DataFrame object or pandas.Series if apply to all DF. + """ + text = str(row["last_body"]) + clean_body = clean_text(text) + clean_body = flag_items(clean_body, flags=flags) + return clean_body + + +def clean_header(row, flags=True): + """Clean the header column. The cleaning involves the following operations: + - Removing the transfers and answers indicators + - Cleaning the text + - Flagging specific items (postal code, phone number, date...) + + Parameters + ---------- + row : row of pandas.Dataframe object, + Data contains 'header' column. + + flags : boolean, optional + True if you want to flag relevant info, False if not. + Default value, True. + + Returns + ------- + row of pd.DataFrame object or pandas.Series if apply to all DF. + """ + text = str(row["header"]) + clean_header = remove_transfer_answer_header(text) + clean_header = clean_text(clean_header) + clean_header = flag_items(clean_header, flags=flags) + return clean_header + + +def clean_text(text): + """Clean a string. The cleaning involves the following operations: + - Putting all letters to lowercase + - Removing all the accents + - Removing all line breaks + - Removing all symbols and punctuations + - Removing the multiple spaces + + Parameters + ---------- + text : str + + Returns + ------- + str + """ + text = text_to_lowercase(text) + text = remove_accents(text) + text = remove_line_break(text) + text = remove_superior_symbol(text) + text = remove_apostrophe(text) + text = remove_multiple_spaces_and_strip_text(text) + return text + + +def text_to_lowercase(text): + """Set all letters to lowercase""" + return text.lower() + + +def remove_accents(text): + """Remove accents from text""" + return unidecode.unidecode(text) + + +def remove_line_break(text): + """Remove line breaks from text""" + return text.replace('\n', '') + + +def remove_superior_symbol(text): + """Remove superior and inferior symbols from text""" + text = text.replace('>', '') + text = text.replace('<', '') + return text + + +def remove_apostrophe(text): + """Remove apostrophes from text""" + return text.replace('\'', ' ') + + +def remove_multiple_spaces_and_strip_text(text): + """Remove multiple spaces, strip text, and remove '-', '*' characters. + + Parameters + ---------- + text : str, + Header content. + + Returns + ------- + str + + """ + for regex_remove_multiple_spaces in regex_remove_multiple_spaces_list: + text = re.sub(regex_remove_multiple_spaces, ' ', text) + text = text.strip() + return text + + +def flag_items(text, flags=True): + """Flag relevant information + ex : amount, phone number, email address, postal code (5 digits).. + + Parameters + ---------- + text : str, + Body content. + + flags : boolean, optional + True if you want to flag relevant info, False if not. 
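# Editorial illustration (not part of the patch) of the cleaning steps above on
# a toy string; this standalone snippet mirrors clean_text without the
# conf-driven multiple-space regexes.
import re
from unidecode import unidecode

text = "Bonjour,\n J'ai   déjà envoyé le document  > merci"
text = text.lower()
text = unidecode(text)                     # 'déjà' -> 'deja'
text = text.replace('\n', '')
text = text.replace('>', '').replace('<', '')
text = text.replace("'", ' ')
text = re.sub(r'\s+', ' ', text).strip()
# -> 'bonjour, j ai deja envoye le document merci'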
+ Default value, True. + + Returns + ------- + str + + """ + if flags: + for regex, value in regex_flags_dict.items(): + text = re.sub(pattern=regex, repl=value, + string=text, flags=re.IGNORECASE) + return text + else: + return text + + +def remove_transfer_answer_header(text): + """Remove historic and transfers indicators in the header. + Ex: "Tr:", "Re:", "Fwd", etc. + + Parameters + ---------- + text : str, + Header content. + + Returns + ------- + str + + """ + for regex, value in regex_clean_header_dict.items(): + text = re.sub(pattern=regex, repl=value, + string=text, flags=re.IGNORECASE) + return text diff --git a/melusine/prepare_email/mail_segmenting.py b/melusine/prepare_email/mail_segmenting.py new file mode 100644 index 00000000..7b155d8b --- /dev/null +++ b/melusine/prepare_email/mail_segmenting.py @@ -0,0 +1,383 @@ +import re +from unidecode import unidecode +from melusine.config.config import ConfigJsonReader + +conf_reader = ConfigJsonReader() +config = conf_reader.get_config_file() +REGEX_TR_RE = config['regex']['manage_transfer_reply'] +REGEX_SEG = config['regex']['mail_segmenting'] + +regex_begin_transfer = REGEX_TR_RE['begin_transfer'] +regex_transfer_other = REGEX_TR_RE['transfer_other'] +regex_extract_from = REGEX_TR_RE['extract_from'] +regex_extract_to = REGEX_TR_RE['extract_to'] +regex_extract_date = REGEX_TR_RE['extract_date'] +regex_extract_header = REGEX_TR_RE['extract_header'] +regex_answer_header = REGEX_TR_RE['answer_header'] +regex_transfert_header = REGEX_TR_RE['transfer_header'] + +regex_segmenting_dict = REGEX_SEG['segmenting_dict'] +regex_segmenting_dict['RE/TR'] = [regex_begin_transfer, + regex_transfer_other, + regex_extract_from, + regex_extract_to, + regex_extract_date, + regex_extract_header, + regex_answer_header, + regex_transfert_header] + +regex_from1 = REGEX_SEG['meta_from1'] +regex_from2 = REGEX_SEG['meta_from2'] +regex_to = REGEX_SEG['meta_to'] +regex_date1 = REGEX_SEG['meta_date1'] +regex_date2 = REGEX_SEG['meta_date2'] +regex_header = REGEX_SEG['meta_header'] +regex_piece_jointe = REGEX_SEG['pattern_pj'] + +regex_exception_une_lettre_maj = REGEX_SEG['pattern_exception_une_lettre_maj'] +regex_exception_Mr = REGEX_SEG['pattern_exception_Mr'] +regex_exception_Dr = REGEX_SEG['pattern_exception_Dr'] +regex_exception_Mme = REGEX_SEG['pattern_exception_Mme'] +regex_exception = REGEX_SEG['pattern_exception'] +regex_pattern_exceptions = (regex_exception_une_lettre_maj + + regex_exception_Mr + + regex_exception_Dr + + regex_exception_Mme + + regex_exception) + +regex_sep_doubles_points_virgules_espace = REGEX_SEG['pattern_sep_doubles_points_virgules_espace'] +regex_pattern_separteurs_evidents = REGEX_SEG['pattern_separteurs_evidents'] +regex_pattern_beginning = REGEX_SEG['pattern_beginning'] +regex_pattern_end = REGEX_SEG['pattern_end'] +regex_pattern = (regex_pattern_beginning + + regex_pattern_separteurs_evidents + + regex_sep_doubles_points_virgules_espace + + regex_pattern_exceptions + + regex_pattern_end) + +regex_typo = REGEX_SEG['tag_typo'] +regex_tag = REGEX_SEG['tag'] +regex_tag_subsentence = REGEX_SEG['tag_subsentence'] +regex_split_message_to_sentences_list = REGEX_SEG['split_message_to_sentences_list'] + + +def structure_email(row): + """ 1. Splits parts of each messages in historic and tags them. + For example a tag can be hello, body, greetings etc + 2. Extracts the meta informations of each messages + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. 
+ + Parameters + ---------- + row : row of pd.Dataframe, apply on column ['structured_historic'] + + Returns + ------- + list of dicts : one dict per message + + Examples + -------- + >>> import pandas as pd + >>> from melusine.prepare_email.build_historic import build_historic + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> data['structured_historic'] = data.apply(build_historic, axis=1) + >>> # data contains column ['structured_historic'] + + >>> from melusine.prepare_email.mail_segmenting import structure_email + >>> structure_email(data.iloc[0]) # apply for 1 sample + >>> data.apply(structure_email, axis=1) # apply to all samples + + """ + structured_body = [] + for message in row['structured_historic']: + structured_message = structure_message(message) + structured_body.append(structured_message) + + return structured_body + + +def structure_message(message): + """ Splits parts of a message and tags them. + For example a tag can be hello, body, greetings etc + Extracts the meta informations of the message + + Parameters + ---------- + message : dict + + Returns + ------- + dict + + Examples + -------- + """ + meta = str(message.get("meta")) + structured_meta, header = structure_meta(meta) + text = str(message.get("text")) + tagged_parts_list = tag_parts_message(text) + structured_message = _tuples_to_dict(structured_meta, + header, + tagged_parts_list) + + return structured_message + + +def structure_meta(meta): + """ Extract meta informations (date, from, to, header) from string meta + + Parameters + ---------- + meta : str + + Returns + ------- + tuple(dict, string) + + Examples + -------- + """ + structured_meta = {} + structured_meta['date'] = _find_date(meta) + structured_meta['from'] = _find_from(meta) + structured_meta['to'] = _find_meta(regex_to, meta) + header = _find_meta(regex_header, meta) + + return structured_meta, header + + +def _find_date(message): + """ Match pattern regex with a given message """ + group = _find_meta(regex_date1, message) + if group is None: + group = _find_meta(regex_date2, message) + + return group + + +def _find_from(message): + """ Match pattern regex with a given message """ + group = _find_meta(regex_from1, message) + if group is None: + group = _find_meta(regex_from2, message) + + return group + + +def _find_meta(regex, message): + """ Match pattern regex with a given message """ + groups = re.findall(regex, message) + if len(groups) < 1: + return None + else: + return groups[0] + + +def tag_parts_message(text): + """ Splits message into sentences, tags them and merges two sentences in a + row having the same tag. 
+ + Parameters + ---------- + text : str, + + + Returns + ------- + list of tuples + + Examples + -------- + """ + sentence_list = split_message_to_sentences(text) + tagged_sentence_list = [] + for sentence in sentence_list: + tagged_sentence = tag_sentence(sentence) + tagged_sentence_list.extend(tagged_sentence) + tagged_parts_list = _merge_parts(tagged_sentence_list) + tagged_parts_list = _remove_empty_parts(tagged_parts_list) + tagged_parts_list = _update_typo_parts(tagged_parts_list) + tagged_parts_list = _remove_typo_parts(tagged_parts_list) + + return tagged_parts_list + + +def split_message_to_sentences(text, sep_=r"(.*?[;.,?!])"): + """ Split each sentences in a text """ + regex1 = regex_split_message_to_sentences_list[0] + regex2 = regex_split_message_to_sentences_list[1] + regex3 = regex_split_message_to_sentences_list[2] + regex4 = regex_split_message_to_sentences_list[3] + text = text.strip(regex1).lstrip(regex2) + text = re.sub(regex3, regex4, text) # remove double punctuation + sentence_list = re.findall(regex_pattern, text, flags=re.M) + sentence_list = [r for s in sentence_list + for r in re.split(regex_piece_jointe, s) if r] + + return sentence_list + + +def tag_sentence(sentence, default='BODY'): + """ Tag a sentence. + If the sentence cannot be tagged it will tag the subsentences + + Parameters + ---------- + sentence : str, + + + Returns + ------- + list of tuples : sentence, tag + + Examples + -------- + """ + tagged_sentence, tagged = tag(sentence) + if tagged: + return tagged_sentence + else: + return _tag_subsentence(sentence) + + +def _tag_subsentence(sentence, default='BODY'): + """ Tags the subsentences in a sentence. + If the subsentences cannot be tagged it will return the whole sentence with + a default tag. + + Parameters + ---------- + sentence : str, + + + Returns + ------- + list of tuples : sentence, tag + + Examples + -------- + """ + subsentence_list = re.findall(regex_tag_subsentence, sentence, flags=re.M) + tagged_subsentence_list = [] + any_sub_catch = False + for subsentence in subsentence_list: + tagged_subsentence, subcatch = tag(subsentence) + if subcatch: + tagged_subsentence_list.extend(tagged_subsentence) + any_sub_catch = True + else: + tagged_subsentence_list.append((subsentence, default)) + if any_sub_catch: + return tagged_subsentence_list + else: + return [(sentence, default)] + + +def tag(string): + """ Tags a string. 
+ + Parameters + ---------- + string : str, + + + Returns + ------- + tuples : list of tuples and boolean + + Examples + -------- + """ + def _remove_accents(string): + return unidecode(string) + regex_parts = regex_segmenting_dict.items() + sentence_with_no_accent = _remove_accents(string) + for k, reg in regex_parts: + for r in reg: + r = _remove_accents(r) + r = r.replace(" ", regex_tag) + if re.search(r, sentence_with_no_accent, re.I): + return [(string, k)], True + + return string, False + + +def _merge_parts(list_de_tuple_parts_id): + """ Merge two consecutives strings with the same tag """ + if len(list_de_tuple_parts_id) <= 1: + return list_de_tuple_parts_id + i = 0 + j = 1 + sentences, tags = zip(*list_de_tuple_parts_id) + tags = list(tags) + sentences = list(sentences) + while j < len(list_de_tuple_parts_id): + if tags[i] == tags[j]: + sentences[i] = " ".join((sentences[i], sentences[j])) + j += 1 + else: + i += 1 + tags[i] = tags[j] + sentences[i] = sentences[j] + j += 1 + list_de_tuples_merged = list(zip(sentences[:i+1], tags[:i+1])) + + return list_de_tuples_merged + + +def _remove_empty_parts(tagged_parts_list): + """ Remove all the empty parts in the list of tagged parts """ + tagged_parts_list = [part for part in tagged_parts_list + if len(part[0]) > 0] + + return tagged_parts_list + + +def _update_typo_parts(tagged_parts_list): + """ Update the tagging for all the typo parts in the list of + tagged parts """ + tagged_parts_list = [_update_typo_part(part_tag_tuple) + for part_tag_tuple in tagged_parts_list] + + return tagged_parts_list + + +def _update_typo_part(part_tag_tuple): + part, tag = part_tag_tuple + if __is_typo(part): + part_tag_tuple = part, "TYPO" + + return part_tag_tuple + + +def __is_typo(part, regex_typo=regex_typo): + """ Check if a string is typo """ + return re.search(regex_typo, part, re.I & re.M) + + +def _remove_typo_parts(tagged_parts_list): + """ """ + tagged_parts_list = [part_tag_tuple + for part_tag_tuple in tagged_parts_list + if part_tag_tuple[1] != "TYPO"] + + return tagged_parts_list + + +def _tuples_to_dict(meta, header, tagged_parts): + """ Convert a dictionnary and list of tuples into dictionnary """ + structured_message = {} + structured_message["meta"] = meta + structured_message["structured_text"] = {} + structured_message["structured_text"]["header"] = header + structured_text = [] + for part, tag in tagged_parts: + dict_message = {} + dict_message["part"] = part + dict_message["tags"] = tag + structured_text.append(dict_message) + structured_message["structured_text"]["text"] = structured_text + + return structured_message diff --git a/melusine/prepare_email/manage_transfer_reply.py b/melusine/prepare_email/manage_transfer_reply.py new file mode 100644 index 00000000..582e468f --- /dev/null +++ b/melusine/prepare_email/manage_transfer_reply.py @@ -0,0 +1,170 @@ +import re +from melusine.config.config import ConfigJsonReader + +conf_reader = ConfigJsonReader() +config = conf_reader.get_config_file() +regex_transfer_header = config['regex']['manage_transfer_reply']['transfer_header'] +regex_answer_header = config['regex']['manage_transfer_reply']['answer_header'] +regex_begin_transfer = config['regex']['manage_transfer_reply']['begin_transfer'] +regex_begin_transfer_cons = config['regex']['manage_transfer_reply']['begin_transfer_cons'] +regex_extract_from = config['regex']['manage_transfer_reply']['extract_from'] +regex_extract_to = config['regex']['manage_transfer_reply']['extract_to'] +regex_extract_date = 
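# Editorial example (not part of the patch) of _merge_parts: consecutive parts
# sharing the same tag are concatenated into a single part.
from melusine.prepare_email.mail_segmenting import _merge_parts

tagged = [('Bonjour,', 'HELLO'),
          ('je vous ecris', 'BODY'),
          ('au sujet de mon contrat.', 'BODY'),
          ('Cordialement,', 'GREETINGS')]
_merge_parts(tagged)
# -> [('Bonjour,', 'HELLO'),
#     ('je vous ecris au sujet de mon contrat.', 'BODY'),
#     ('Cordialement,', 'GREETINGS')]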
config['regex']['manage_transfer_reply']['extract_date'] +regex_extract_header = config['regex']['manage_transfer_reply']['extract_header'] + + +def add_boolean_transfer(row): + """Compute boolean Series which return True if the "header" starts with given + regex 'answer_subject', False if not. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. + + Parameters + ---------- + row : row of pd.Dataframe, columns ['header'] + + Returns + ------- + pd.Series + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> # data contains a 'header' column + + >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_transfer + >>> add_boolean_transfer(data.iloc[0]) # apply for 1 sample + >>> data.apply(add_boolean_transfer, axis=1) # apply to all samples + + """ + is_transfer = False + try: + if re.match(regex_transfer_header, row['header']): + is_transfer = True + except Exception as e: + pass + + return is_transfer + + +def add_boolean_answer(row): + """Compute boolean Series which return True if the "header" starts with given + regex 'transfer_subject', False if not. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. + + Parameters + ---------- + row : row of pd.Dataframe, columns ['header'] + + Returns + ------- + pd.Series + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> # data contains a 'header' column + + >>> from melusine.prepare_email.manage_transfer_reply import add_boolean_answer + >>> add_boolean_answer(data.iloc[0]) # apply for 1 sample + >>> data.apply(add_boolean_answer, axis=1) # apply to all samples + + """ + is_answer = False + try: + if re.match(regex_answer_header, row['header']): + is_answer = True + except Exception as e: + pass + + return is_answer + + +def check_mail_begin_by_transfer(row): + """Compute boolean Series which return True if the "body" starts with given + regex 'begin_transfer', False if not. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. + + Parameters + ---------- + row : row of pd.Dataframe, columns ['body'] + + Returns + ------- + pd.Series + + Examples + -------- + >>> import pandas as pd + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> # data contains a 'body' column + + >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer + >>> check_mail_begin_by_transfer(data.iloc[0]) # apply for 1 sample + >>> data.apply(check_mail_begin_by_transfer, axis=1) # apply to all samples + + """ + is_begin_by_transfer = False + try: + if re.search(regex_begin_transfer, row['body']): + is_begin_by_transfer = True + if re.search(regex_begin_transfer_cons, row['body']): + is_begin_by_transfer = True + except Exception as e: + pass + + return is_begin_by_transfer + + +def update_info_for_transfer_mail(row): + """Extracts and updates informations from forwarded mails, such as: body, + from, to, header, date. + - It changes the header by the initial subject (extracted from forward + email). + - It removes the header from emails' body. + + To be used with methods such as: `apply(func, axis=1)` or + `apply_by_multiprocessing(func, axis=1, **kwargs)`. 
+ + Parameters + ---------- + row : row of pd.Dataframe, + columns ['body', 'header', 'from', 'to', 'date', 'is_begin_by_transfer'] + + Returns + ------- + pd.DataFrame + + Examples + -------- + >>> import pandas as pd + >>> from melusine.prepare_email.manage_transfer_reply import check_mail_begin_by_transfer + >>> data = pd.read_pickle('./tutorial/data/emails_anonymized.pickle') + >>> data['is_begin_by_transfer'] = data.apply(check_mail_begin_by_transfer, axis=1) + >>> # data contains columns ['from', 'to', 'date', 'header', 'body', 'is_begin_by_transfer'] + + >>> from melusine.prepare_email.manage_transfer_reply import update_info_for_transfer_mail + >>> update_info_for_transfer_mail(data.iloc[0]) # apply for 1 sample + >>> data.apply(update_info_for_transfer_mail, axis=1) # apply to all samples + + """ + try: + if row['is_begin_by_transfer']: + row['from'] = re.split(regex_extract_from, row['body'])[1] + row['to'] = re.split(regex_extract_to, row['body'])[1] + row['date'] = re.split(regex_extract_date, row['body'])[1] + row['header'] = re.split(regex_extract_header, row['body'])[1] + row['body'] = ''.join(row['body'].split( + re.findall(regex_extract_header, row['body'])[0])[1:]) + + except Exception as E: + pass + + return row diff --git a/melusine/prepare_email/metadata_engineering.py b/melusine/prepare_email/metadata_engineering.py new file mode 100644 index 00000000..b66b41ff --- /dev/null +++ b/melusine/prepare_email/metadata_engineering.py @@ -0,0 +1,187 @@ +import re +import pandas as pd +from collections import Counter +from sklearn import preprocessing +from sklearn.base import BaseEstimator, TransformerMixin + + +class MetaExtension(BaseEstimator, TransformerMixin): + """Transformer which creates 'extension' feature extracted + from regex in metadata. It extracts extension of mail adresses. + + Compatible with scikit-learn API. + """ + + def __init__(self): + self.le_extension = preprocessing.LabelEncoder() + + def fit(self, X, y=None): + """ Fit LabelEncoder on encoded extensions.""" + X['extension'] = X['from'].apply(self.get_extension) + self.top_extension = self.get_top_extension(X, n=100) + X['extension'] = X['extension'].apply( + lambda x: self.encode_extension(x, self.top_extension)) + self.le_extension.fit(X['extension']) + return self + + def transform(self, X): + """Encode extensions""" + X['extension'] = X['from'].apply(self.get_extension) + X['extension'] = X['extension'].apply( + lambda x: self.encode_extension(x, self.top_extension)) + X['extension'] = self.le_extension.transform(X['extension']) + return X + + @staticmethod + def get_extension(x): + """Gets extension from email address.""" + try: + extension = re.findall(r'\@([^.]+)', x)[0] + except Exception as e: + return '' + return extension + + @staticmethod + def get_top_extension(X, n=100): + "Returns list of most common extensions." + a = Counter(X['extension'].values) + a = a.most_common(n) + a = [x[0] for x in a] + return a + + @staticmethod + def encode_extension(x, top_ext): + """Encode most common extensions and set the rest to 'other'.""" + if x in top_ext: + return x + else: + return 'other' + + +class MetaDate(BaseEstimator, TransformerMixin): + """Transformer which creates new features from dates such as: + - hour + - minute + - dayofweek + + Compatible with scikit-learn API. + + Parameters + ---------- + date_format : str, optional + Regex to extract date from text. + + date_format : str, optional + A date format. 
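# Editorial example (not part of the patch) of MetaExtension.get_extension: the
# domain chunk between '@' and the first dot becomes the 'extension' feature.
import re

re.findall(r'\@([^.]+)', 'jean.dupont@wanadoo.fr')[0]   # -> 'wanadoo'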
+ """ + + def __init__(self, + regex_date_format=r'\w+ (\d+) (\w+) (\d{4}) (\d{2}) h (\d{2})', + date_format='%d/%m/%Y %H:%M'): + self.regex_date_format = regex_date_format + self.date_format = date_format + self.month = { + 'janvier': '1', + 'février': '2', + 'mars': '3', + 'avril': '4', + 'mai': '5', + 'juin': '6', + 'juillet': '7', + 'août': '8', + 'septembre': '9', + 'octobre': '10', + 'novembre': '11', + 'décembre': '12', + } + + def fit(self, X, y=None): + """Unused method. Defined only for compatibility with scikit-learn API. + """ + return self + + def transform(self, X): + """Transform date to hour, min, day features.""" + X['date'] = X['date'].apply(self.date_formatting, + args=(self.regex_date_format, )) + X['date'] = pd.to_datetime(X['date'], + format=self.date_format, + infer_datetime_format=False, + errors='coerce') + X['hour'] = X['date'].apply(self.get_hour) + X['min'] = X['date'].apply(self.get_min) + X['dayofweek'] = X['date'].apply(self.get_dayofweek) + return X + + def date_formatting(self, x, regex_format): + """Set a date in the right format""" + try: + e = re.findall(regex_format, x)[0] + date = e[0]+'/'+e[1]+'/'+e[2]+' '+e[3]+':'+e[4] + for m, m_n in self.month.items(): + date = date.replace(m, m_n) + except Exception as e: + return x + return date + + @staticmethod + def get_hour(x): + """Get hour from date""" + try: + return x.hour + except Exception as e: + return 0 + + @staticmethod + def get_min(x): + """Get minutes from date""" + try: + return x.minute + except Exception as e: + return 0 + + @staticmethod + def get_dayofweek(x): + """Get day of the week from date""" + try: + return x.dayofweek + except Exception as e: + return 0 + + +class Dummifier(BaseEstimator, TransformerMixin): + """Transformer to dummifies categorial features. + Compatible with scikit-learn API. + """ + + def __init__(self, + columns_to_dummify=['extension', 'dayofweek', 'hour', 'min'], + copy=True): + self.columns_to_dummify = columns_to_dummify + self.copy = copy + pass + + def fit(self, X, y=None): + """Store dummified features to avoid inconsistance of + new data which could contain new labels (unknown from train data). + """ + self.X_ = pd.get_dummies( + X, columns=self.columns_to_dummify, prefix_sep='__', dummy_na=False) + + dummies_ = tuple([col + '__' for col in self.columns_to_dummify]) + self.dummy_features = [c for c in self.X_.columns if c.startswith(dummies_)] + + return self + + def transform(self, X, y=None): + """Dummify features and keep only common labels with pretrained data. 
+ """ + if self.copy: + X_ = X.copy() + else: + X_ = X + + X_ = pd.get_dummies( + X_, columns=self.columns_to_dummify, prefix_sep='__', dummy_na=False) + + return X_[self.dummy_features] diff --git a/melusine/summarizer/__init__.py b/melusine/summarizer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/melusine/summarizer/keywords_generator.py b/melusine/summarizer/keywords_generator.py new file mode 100644 index 00000000..8dd2e60b --- /dev/null +++ b/melusine/summarizer/keywords_generator.py @@ -0,0 +1,260 @@ +import numpy as np +import pandas as pd +import scipy.sparse as sp +from collections import Counter +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.feature_extraction.text import TfidfVectorizer +from melusine.utils.multiprocessing import apply_by_multiprocessing +from melusine.config.config import ConfigJsonReader + +conf_reader = ConfigJsonReader() +config = conf_reader.get_config_file() + +keywords = config["words_list"]["keywords"] +prenoms_list = config["words_list"]["names"] +newStopWords = config["words_list"]["stopwords"] +stopwords = prenoms_list + newStopWords + + +class KeywordsGenerator(BaseEstimator, TransformerMixin): + """Class to extract list of keywords from text. + + It is compatible with scikit-learn API (i.e. contains fit, transform + methods). + + Parameters + ---------- + + max_tfidf_features : int, optional + Size of vocabulary for tfidf. + Default value, 10000. + + keywords : list, optional + Keywords to extracted as priority. + Default value, "keywords" list defined in conf file. + + stopwords : list, optional + Stopwords not to be extracted. + Default value, "names" and "stopwords" lists defined in conf file. + + resample : bool, optional + True if dataset must be resampled according to class distribution, + else False. + Default value, True. + + n_jobs : int, optional + Number of cores used for computation. + Default value, 20. + + copy : bool, optional + Make a copy of DataFrame. + Default value, True. + + n_max_keywords : int, optional + Maximum number of keywords to be returned. + Default value, 6. + + n_min_keywords : int, optional + Minimum number of keywords to be returned. + Default value, 0. + + threshold_keywords : float, optional + Minimum tf-idf score for word to be selected as keyword. + Default value, 0.0. + + n_docs_in_class : int, optional + Number of documents in each classes. + Default value, 100. + + keywords_coef : int, optional + Coefficient multiplied with the tf-idf scores of each keywords. + Default value, 10. + + Attributes + ---------- + max_tfidf_features, keywords, stopwords, resample, n_jobs, progress_bar, + copy, n_max_keywords, n_min_keywords, threshold_keywords, n_docs_in_class, + keywords_coef, + + tfidf_vectorizer : TfidfVectorizer instance from sklearn, + + dict_scores_ : dictionary, + Tf-idf scores for each tokens. 
+ + max_score_ : np.array, + + Examples + -------- + >>> from melusine.summarizer.keywords_generator import KeywordsGenerator + >>> keywords_generator = KeywordsGenerator() + >>> keywords_generator.fit(X, y) + >>> keywords_generator.transform(X) + >>> print(X['keywords']) + + """ + + def __init__(self, + max_tfidf_features=10000, + keywords=keywords, + stopwords=stopwords, + resample=False, + n_jobs=20, + progress_bar=True, + copy=True, + n_max_keywords=6, + n_min_keywords=0, + threshold_keywords=0.0, + n_docs_in_class=100, + keywords_coef=10): + self.max_tfidf_features_ = max_tfidf_features + self.tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features + ) + self.keywords = keywords + self.stopwords = stopwords + self.resample = resample + self.n_jobs = n_jobs + self.progress_bar = progress_bar + self.copy = copy + self.n_max_keywords = n_max_keywords + self.n_min_keywords = n_min_keywords + self.threshold_keywords = threshold_keywords + self.n_docs_in_class = n_docs_in_class + self.keywords_coef = keywords_coef + + def fit(self, X, y=None): + """Fit the weighted tf-idf model with input data. + + If resample attribute is True the dataset will be resampled according + to class distribution. + + Parameters + ---------- + X : pandas.DataFrame, shape (n_samples, n_features) + X must contain ['tokens'] column. + + y : Ignored + + Returns + ------- + self : object + Returns the instance itself. + """ + if self.resample: + X_resample = self.resample_docs(X, y) + else: + X_resample = X + + X_resample['tokens'] = X_resample['tokens'].apply(self._remove_stopwords) + + # fit tf-idf on resample data set + tokens_joined = X_resample['tokens'].apply(lambda x: ' '.join(x)) + self.tfidf_vectorizer.fit(tokens_joined) + + # modify the idf weights given frequency in the corpus + idf_weights = self._add_tf_to_idf(X_resample) + self.tfidf_vectorizer._tfidf._idf_diag = sp.spdiags(idf_weights, + diags=0, + m=len(idf_weights), + n=len(idf_weights)) + + # return vetorizer with binary term frequency atribute + self.dict_scores_ = dict(zip(self.tfidf_vectorizer.get_feature_names(), + self.tfidf_vectorizer.idf_)) + self.max_score_ = np.max(self.tfidf_vectorizer.idf_) + + return self + + def transform(self, X): + """Returns list of keywords in apparition order for each document + with the weighted tf-idf already fitted. + + Parameters + ---------- + X : pandas.DataFrame, shape (n_samples, n_features) + X must contain ['tokens'] column. + + Returns + ------- + X_new : pandas.DataFrame, shape (n_samples, n_components) + """ + if self.copy: + X_ = X.copy() + else: + X_ = X + + X_['keywords'] = apply_by_multiprocessing(df=X_[['tokens']], + func=self.get_keywords, + axis=1, + workers=self.n_jobs, + progress_bar=self.progress_bar) + + return X_ + + def get_keywords(self, row): + """Returns list of keywords in apparition order with the + weighted tf-idf already fitted. 
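# Illustrative sketch (editorial, not part of the patch): fitting the keywords
# generator on a DataFrame that already holds a 'tokens' column; the toy data
# and reduced parameters are assumptions.
import pandas as pd
from melusine.summarizer.keywords_generator import KeywordsGenerator

X = pd.DataFrame({'tokens': [['resiliation', 'contrat', 'habitation'],
                             ['demande', 'attestation', 'assurance']]})
y = pd.Series([0, 1])
keywords_generator = KeywordsGenerator(n_max_keywords=3, n_jobs=1, progress_bar=False)
keywords_generator.fit(X, y)
X = keywords_generator.transform(X)
# X['keywords'] -> up to three tokens per email, in order of appearance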
+ + Parameters + ---------- + row : row of pd.Dataframe, columns ['tokens'] + + Returns + ------- + list of strings + """ + tokens = self._remove_stopwords(row['tokens']) + tokens = [x for x in tokens if not x.isdigit()] + scores = Counter({t: self.dict_scores_.get(t, 0) for t in tokens}) + n = sum(i > self.threshold_keywords for i in list(scores.values())) + n = min(n, self.n_max_keywords) + n = max(n, self.n_min_keywords) + keywords = [x[0] for x in scores.most_common(n)] + index_sorted = [(k, tokens.index(k)) for k in keywords if k in tokens] + index_sorted = sorted(index_sorted, key=lambda x: x[1]) + keywords_sorted = [i[0] for i in index_sorted] + + return keywords_sorted + + def resample_docs(self, X, y=None): + """Method for resampling documents according to class distribution.""" + X_ = X.copy() + if y is not None: + X_['label'] = y + X_['split'] = 0 + for c in X_.label.unique(): + N_c = X_[X_["label"] == c].shape[0] + I_c = np.random.randint(0, self.n_docs_in_class+1, N_c) + X_.loc[X_["label"] == c, 'split'] = I_c + + X_resample = pd.DataFrame( + X_[['label', 'split', 'tokens']] + .groupby(['label', 'split'], as_index=False)['tokens'] + .sum() + ) + + return X_resample + + def _remove_stopwords(self, tokens): + """Method to filter stopwords from potential list of keywords.""" + return [t for t in tokens if t not in self.stopwords] + + def _add_tf_to_idf(self, X): + """Returns the tf-idf weights of each tokens""" + tokens_joined = X['tokens'].apply(lambda x: ' '.join(x)) + X_vec = self.tfidf_vectorizer.transform(tokens_joined) + feature_names = self.tfidf_vectorizer.get_feature_names() + idf_weights = self._get_weights(X_vec.toarray(), + self.keywords, + feature_names) + + return idf_weights + + def _get_weights(self, X_vec, keywords_list, feature_names): + """Put max weights for each word of redistributed mails.""" + max_ = np.max(X_vec, axis=0) + mmax_ = np.max(max_) + for k in keywords_list: + if k in feature_names: + max_[feature_names.index(k)] = mmax_ * self.keywords_coef + + return max_ diff --git a/melusine/utils/__init__.py b/melusine/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/melusine/utils/multiprocessing.py b/melusine/utils/multiprocessing.py new file mode 100644 index 00000000..80810179 --- /dev/null +++ b/melusine/utils/multiprocessing.py @@ -0,0 +1,48 @@ +import numpy as np +import pandas as pd +from tqdm import tqdm +from multiprocessing import Pool + + +def _apply_df(args): + """Apply a function along an axis of the DataFrame""" + df, func, kwargs = args + if 'progress_bar' not in kwargs: + progress_bar = False + else: + progress_bar = kwargs.pop('progress_bar') + + if progress_bar: + tqdm.pandas(leave=False, desc=func.__name__, ncols=100, unit='emails') + return df.progress_apply(func, **kwargs) + return df.apply(func, **kwargs) + + +def apply_by_multiprocessing(df, func, **kwargs): + """Apply a function along an axis of the DataFrame using multiprocessing. + A maximum of half of the core available in the system will be used. + + Parameters + ---------- + df : pd.DataFrame + DataFrame where the function is applied + + func : function to apply + + Returns + ------- + pd.DataFrame + Returns the DataFrame with the funtion applied. 
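# Editorial usage sketch (not part of the patch) of apply_by_multiprocessing:
# 'workers' is a required keyword argument (it is popped from kwargs) and
# 'progress_bar' is optional.
import pandas as pd
from melusine.utils.multiprocessing import apply_by_multiprocessing

def count_words(row):
    return len(row['clean_body'].split())

df = pd.DataFrame({'clean_body': ['bonjour a tous', 'merci beaucoup pour votre reponse']})
df['n_words'] = apply_by_multiprocessing(df=df, func=count_words, axis=1,
                                         workers=2, progress_bar=False)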
+ """ + # define the number of cores to work with + workers = kwargs.pop('workers') + workers = min(workers, int(df.shape[0] / 2)) + workers = max(workers, 1) + if df.shape[0] == 1: + return _apply_df((df, func, kwargs)) + + pool = Pool(processes=workers) + result = pool.map(_apply_df, [(d, func, kwargs) + for d in np.array_split(df, workers)]) + pool.close() + return pd.concat(list(result)) diff --git a/melusine/utils/printer.py b/melusine/utils/printer.py new file mode 100644 index 00000000..fae4fe17 --- /dev/null +++ b/melusine/utils/printer.py @@ -0,0 +1,60 @@ + +def print_color_mail(structured_body): + """Highlight the tagged sentences. + + Parameters + ---------- + structured_body : a structured body from process_sent_tag, + + Returns + ------- + Print the mail by sentence. + + """ + for message in structured_body: + print("___________________________\n") + print_color(str(message.get("meta")), "META") + + structured_text = message.get("structured_text") + header = structured_text.get("header") + print_color(str(header), "HEADER") + + for sentence in structured_text.get("text"): + text = sentence.get("part") + tag = sentence.get("tags") + print_color(text, tag) + + +def print_color(text, part=None): + """Select according to the tag the right color to use when printing.""" + switcher_tag = { + "HELLO": "\033[0;37;44m" + "HELLO" + "\033[0m", + "GREETINGS": "\033[0;37;45m" + "GREETINGS" + "\033[0m", + "SIGN": "\033[0;37;41m" + "SIGN" + "\033[0m", + "THANKS": "\033[0;31;46m" + "THANKS" + "\033[0m", + "PJ": "\033[0;37;42m" + "PJ" + "\033[0m", + "META": "\33[43m" + "META" + "\033[0m", + "FOOTER": "\33[41m" + "FOOTER" + "\033[0m", + "DISCLAIMER": "\33[41m" + "DISCLAIMER" + "\033[0m", + "TYPO": "\33[47m" + "TYPO" + "\033[0m", + "HEADER": "\033[0;37;41m" + "HEADER" + "\033[0m" + } + + switcher = { + "HELLO": "\033[0;37;44m" + text + "\033[0m", + "GREETINGS": "\033[0;37;45m" + text + "\033[0m", + "SIGN": "\033[0;37;41m" + text + "\033[0m", + "THANKS": "\033[0;31;46m" + text + "\033[0m", + "PJ": "\033[0;37;42m" + text + "\033[0m", + "META": "\33[43m" + text + "\033[0m", + 'FOOTER': "\33[41m" + text + "\033[0m", + 'DISCLAIMER': "\33[41m" + text + "\033[0m", + 'TYPO': "\33[47m" + text + "\033[0m", + "HEADER": "\033[0;37;41m" + text + "\033[0m" + } + + # print("TAG : ", switcher_tag.get(part, text)) + if part == "BODY": + print("> BODY : ", switcher.get(part, text)) + else: + print("> ", switcher_tag.get(part, text), " : ", switcher.get(part, text)) diff --git a/melusine/utils/streamer.py b/melusine/utils/streamer.py new file mode 100644 index 00000000..c67a6460 --- /dev/null +++ b/melusine/utils/streamer.py @@ -0,0 +1,99 @@ +import nltk +from melusine.prepare_email.mail_segmenting import split_message_to_sentences +from melusine.utils.multiprocessing import apply_by_multiprocessing + + +class Streamer(): + """Class to transform pd.Series into stream. + + Used to prepare the data for the training of the phraser and embeddings. + + Attributes + ---------- + columns : str or list of str, + Input text column(s) to consider for the streamer. + + stream : MailIterator object, + Stream of all the tokens of the pd.Series. 
+ + Examples + -------- + >>> streamer = Streamer() + >>> streamer.to_stream(X) # will build the stream attribute + >>> tokens_stream = streamer.stream + >>> print(tokens_stream) + + """ + + def __init__(self, columns='clean_body', n_jobs=40): + self.columns_ = columns + self.n_jobs = n_jobs + + def to_stream(self, X): + """Build a MailIterator object containing a stream of tokens from + a pd.Series. + + Parameters + ---------- + X : pd.Dataframe. + + Examples + -------- + >>> streamer.to_stream(X) # will build the stream attribute + >>> tokens_stream = streamer.stream + >>> print(tokens_stream) + + """ + flattoks = self.to_flattoks(X) + self.stream = MailIterator(flattoks) + pass + + def to_flattoks(self, X): + """Create a list of lists of tokens from a pd.Series. + Each list of tokens corresponds to a sentence. + + Parameters + ---------- + X : pd.Dataframe, + + Returns + ------- + list of lists of strings + """ + tokenized_sentences_list = apply_by_multiprocessing(X[self.columns_], + self.to_list_of_tokenized_sentences, + workers=self.n_jobs + ) + flattoks = [item for sublist in tokenized_sentences_list + for item in sublist] + return flattoks + + def to_list_of_tokenized_sentences(self, text): + """Create a list of lists of tokens from a text. + Each list of tokens corresponds to a sentence. + + Parameters + ---------- + text : str + + Returns + ------- + list of lists of strings + """ + sentences_list = split_message_to_sentences(text) + tokenized_sentences_list = [nltk.regexp_tokenize(sentence, + pattern="\w+(?:[\?\-\'\"_]\w+)*") + for sentence in sentences_list + if sentence != ""] + return tokenized_sentences_list + + +class MailIterator(): + """Class to transform a stream of tokens into an iterator.""" + + def __init__(self, tok_stream): + self.tok_stream = tok_stream + + def __iter__(self): + for sent in self.tok_stream: + yield sent diff --git a/melusine/utils/transformer_scheduler.py b/melusine/utils/transformer_scheduler.py new file mode 100644 index 00000000..6e5de6c4 --- /dev/null +++ b/melusine/utils/transformer_scheduler.py @@ -0,0 +1,187 @@ +""" +Useful class to define your own transformer by applying specific functions +in a specific order along a row of a DataFrame (axis=1). + +It is compatible with scikit-learn API (i.e. contains fit, transform methods). 
+""" + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from melusine.utils.multiprocessing import apply_by_multiprocessing + + +def __check_function_type(func): + """Check if it is a function-like object.""" + if not callable(func): + raise TypeError("First item of the tuple (func, args, cols) must be a function-like \ +object not a {} object".format(type(func))) + else: + return func + + +def __check_args_type(args): + """Check if it is a tuple-like object.""" + if args is None or args == (): + return None + elif isinstance(args, int) or isinstance(args, str) or isinstance(args, list): + # manage the case of 1 element in tuple (example: args=(4)) + return (args, ) + elif not isinstance(args, tuple): + raise TypeError("Second item of the tuple (func, args, cols) must be tuple-like \ +object not a {} object".format(type(args))) + else: + return args + + +def __check_colnames_type(cols): + """Check if it is a list-like object.""" + if cols is None or cols == []: + return None + elif not isinstance(cols, list): + raise TypeError("Third item of the tuple (func, args, cols) must be list-like \ +object not a {} object".format(type(cols))) + else: + return cols + + +def _check_tuple(func, args=None, cols=None): + """Complete checking of each element for the 'function_scheduler' + parameter.""" + # check types of each parameters + func = __check_function_type(func) + args = __check_args_type(args) + cols = __check_colnames_type(cols) + + return (func, args, cols) + + +class TransformerScheduler(BaseEstimator, TransformerMixin): + """ + This class aims to provide a good way to define its own transformer. + It takes a list of function defined in a specific order to apply along a + row of DataFrame (axis=1). + Transformer returned is compatible with scikit-learn API + (i.e. contains fit, transform methods). + + Parameters + ---------- + functions_scheduler : list of tuples, (function, tuple, list) + List of function to be applied in a specific order. + Each element of the list has to be defined as follow: + (`function`, `argument(s) used by the function (optional)`, `colname(s) + returned (optional)`) + + mode : str {'apply', 'apply_by_multiprocessing'}, optional + Define mode to apply function along a row axis (axis=1). + Default value, 'apply'. + If set to 'apply_by_multiprocessing', it uses multiprocessing tool + to parallelize computation. + + n_jobs : int, optional + Number of cores used for computation. Default value, 1. + + progress_bar : boolean, optional + Whether to print a progress bar from tqdm package. Default value, True. + Works only when mode is set to 'apply_by_multiprocessing'. + + copy : boolean, optional + Make a copy of DataFrame. Default value, True. + + verbose : int, optional + Verosity mode, print loggers. Default value, 0. 
+
+    Attributes
+    ----------
+    functions_scheduler, mode, n_jobs, progress_bar
+
+    Examples
+    --------
+    >>> from melusine.utils.transformer_scheduler import TransformerScheduler
+
+    >>> MelusineTransformer = TransformerScheduler(
+    >>>     functions_scheduler=[
+    >>>         (my_function_1, (argument1, argument2), ['return_col_A']),
+    >>>         (my_function_2, None, ['return_col_B', 'return_col_C']),
+    >>>         (my_function_3, (), ['return_col_D'])
+    >>>     ])
+
+    """
+
+    def __init__(self,
+                 functions_scheduler,
+                 mode='apply',
+                 n_jobs=None,
+                 progress_bar=True,
+                 copy=True,
+                 verbose=0):
+        self.functions_scheduler = functions_scheduler
+        self.mode = mode
+        self.n_jobs = n_jobs
+        self.progress_bar = progress_bar
+        self.copy = copy
+        self.verbose = verbose
+
+        # check the type of every (func, args, cols) tuple passed
+        for tuple_ in functions_scheduler:
+            func, args, cols = _check_tuple(*tuple_)
+
+        if n_jobs is None:
+            self.n_jobs = 1
+
+    def fit(self, X, y=None):
+        """Unused method. Defined only for compatibility with the scikit-learn API.
+        """
+        return self
+
+    def transform(self, X):
+        """Apply the functions defined in the `functions_scheduler` parameter.
+
+        Parameters
+        ----------
+        X : pandas.DataFrame
+            Data on which transformations are applied.
+
+        Returns
+        -------
+        pandas.DataFrame
+        """
+        if self.copy:
+            X_ = X.copy()
+        else:
+            X_ = X
+
+        for tuple_ in self.functions_scheduler:
+            func_, args_, cols_ = _check_tuple(*tuple_)
+
+            if self.mode == 'apply':
+                if cols_ is None:
+                    X_ = X_.apply(func_, args=args_, axis=1)
+                elif len(cols_) == 1:
+                    X_[cols_[0]] = X_.apply(func_, args=args_, axis=1)
+                else:
+                    X_[cols_] = X_.apply(func_, args=args_, axis=1).apply(pd.Series)
+            else:  # 'apply_by_multiprocessing'
+                if cols_ is None:
+                    X_ = apply_by_multiprocessing(df=X_,
+                                                  func=func_,
+                                                  args=args_,
+                                                  axis=1,
+                                                  workers=self.n_jobs,
+                                                  progress_bar=self.progress_bar)
+                elif len(cols_) == 1:
+                    X_[cols_[0]] = apply_by_multiprocessing(df=X_,
+                                                            func=func_,
+                                                            args=args_,
+                                                            axis=1,
+                                                            workers=self.n_jobs,
+                                                            progress_bar=self.progress_bar)
+                else:
+                    X_[cols_] = apply_by_multiprocessing(df=X_,
+                                                         func=func_,
+                                                         args=args_,
+                                                         axis=1,
+                                                         workers=self.n_jobs,
+                                                         progress_bar=self.progress_bar).apply(pd.Series)
+
+        return X_
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 00000000..36d98b17
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,13 @@
+pip==18.1
+bumpversion==0.5.3
+wheel==0.32.1
+watchdog==0.9.0
+flake8==3.5.0
+tox==3.5.2
+coverage==4.5.1
+Sphinx==1.8.1
+sphinx-bootstrap-theme==0.6.5
+twine==1.12.1
+pytest==3.8.2
+pytest-runner==4.2
+numpydoc==0.8.0
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..67ac2b4d
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,30 @@
+[bumpversion]
+current_version = 1.1.3
+commit = True
+tag = True
+
+[bumpversion:file:setup.py]
+search = version='{current_version}'
+replace = version='{new_version}'
+
+[bumpversion:file:melusine/__init__.py]
+search = __version__ = '{current_version}'
+replace = __version__ = '{new_version}'
+
+[bdist_wheel]
+universal = 1
+
+[flake8]
+exclude = docs
+max-line-length = 100
+
+[aliases]
+test = pytest
+
+[tool:pytest]
+collect_ignore = ['setup.py']
+pep8maxlinelength = 100
+
+[pep8]
+max-line-length = 100
+
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..bb2f77de
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""The setup script."""
+
+from setuptools import setup
+
+with open('README.md') as readme_file:
+    readme = readme_file.read()
+
+with open('HISTORY.rst') as
history_file: + history = history_file.read() + +requirements = ['pandas>=0.22.0', + 'scikit-learn>=0.19.0', + 'gensim>=3.3.0', + 'nltk>=3.3', + 'keras>=2.2.0', + 'tqdm>=4.14', + 'tensorflow>=1.10.0', + 'unidecode' + ] + +setup_requirements = ['pytest-runner', ] + +test_requirements = ['pytest', ] + +setup( + author="Sacha Samama, Tom Stringer, Antoine Simoulin", + author_email='ssamama@quantmetry.com', + classifiers=[ + 'Development Status :: 2 - Pre-Alpha', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: Apache Software License', + 'Natural Language :: English', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], + description="Melusine is a high-level package for french emails preprocessing, classification and feature extraction, written in Python.", + entry_points={}, + install_requires=requirements, + license="Apache Software License 2.0", + long_description=readme, + long_description_content_type='text/markdown', + include_package_data=True, + keywords='melusine', + name='melusine', + package_dir={ + 'melusine': 'melusine', + 'melusine.config': 'melusine/config', + 'melusine.utils': 'melusine/utils', + 'melusine.nlp_tools': 'melusine/nlp_tools', + 'melusine.prepare_email': 'melusine/prepare_email', + 'melusine.summarizer': 'melusine/summarizer', + 'melusine.models': 'melusine/models' + }, + packages=['melusine', 'melusine.config', 'melusine.utils', + 'melusine.nlp_tools', 'melusine.prepare_email', + 'melusine.summarizer', 'melusine.models'], + data_files=[('config', ['melusine/config/conf.json'])], + setup_requires=setup_requirements, + test_suite='tests', + tests_require=test_requirements, + url='https://github.com/MAIF/melusine', + version='1.1.3', + zip_safe=False, +) diff --git a/tests/unit_tests/prepare_email/test_body_header_extraction.py b/tests/unit_tests/prepare_email/test_body_header_extraction.py new file mode 100644 index 00000000..04c764dd --- /dev/null +++ b/tests/unit_tests/prepare_email/test_body_header_extraction.py @@ -0,0 +1,83 @@ +import pandas as pd +import numpy as np +from melusine.prepare_email.body_header_extraction import extract_last_body +from melusine.prepare_email.body_header_extraction import extract_body +from melusine.prepare_email.body_header_extraction import extract_header + + +structured_body = [ + {'meta': {'date': None, + 'from': None, + 'to': None}, + 'structured_text': {'header': 'demande document', + 'text': [{'part': 'Bonjour. ', 'tags': 'HELLO'}, + {'part': "Je vous remercie pour le document", + 'tags': 'BODY'}, + {'part': 'Cordialement,', 'tags': 'GREETINGS'}, + {'part': 'Mr Unknown', 'tags': 'BODY'} + ] + } + }, + {'meta': {'date': ' mar. 22 mai 2018 à 10:20', + 'from': ' ', + 'to': None}, + 'structured_text': {'header': 'demande document', + 'text': [{'part': 'Bonjour. 
', 'tags': 'HELLO'}, + {'part': "Merci de bien vouloir prendre connaissance du document ci-joint", + 'tags': 'BODY'}, + {'part': 'Cordialement,', 'tags': 'GREETINGS'}, + {'part': 'Votre mutuelle', 'tags': 'BODY'}, + {'part': 'La visualisation des fichiers PDF nécessite Adobe Reader.', + 'tags': 'FOOTER'} + ] + } + } + ] + + +def test_extract_last_body(): + input_df = pd.DataFrame({ + 'structured_body': [structured_body] + }) + + output_df = pd.Series(["Je vous remercie pour le document "]) + + result = input_df.apply(extract_last_body, axis=1) + pd.testing.assert_series_equal(result, output_df) + + +message_dict = {'meta': {'date': ' mar. 22 mai 2018 à 10:20', + 'from': ' ', + 'to': None}, + 'structured_text': {'header': 'demande document', + 'text': [{'part': 'Bonjour. ', + 'tags': 'HELLO'}, + {'part': "Merci de bien vouloir prendre connaissance du document ci-joint", + 'tags': 'BODY'}, + {'part': 'Cordialement,', + 'tags': 'GREETINGS'}, + {'part': 'Votre mutuelle', + 'tags': 'BODY'}, + {'part': 'La visualisation des fichiers PDF nécessite Adobe Reader.', + 'tags': 'FOOTER'} + ] + } + } + + +def test_extract_body(): + input_dict = message_dict + + output = "Merci de bien vouloir prendre connaissance du document ci-joint " + + result = extract_body(input_dict) + np.testing.assert_string_equal(result, output) + + +def test_extract_header(): + input_dict = message_dict + + output = 'demande document' + + result = extract_header(input_dict) + np.testing.assert_string_equal(result, output) diff --git a/tests/unit_tests/prepare_email/test_build_historic.py b/tests/unit_tests/prepare_email/test_build_historic.py new file mode 100644 index 00000000..49865a6d --- /dev/null +++ b/tests/unit_tests/prepare_email/test_build_historic.py @@ -0,0 +1,35 @@ +import pandas as pd +from melusine.prepare_email.build_historic import build_historic + +body = " \n \n \n Bonjours, \n \n Suite a notre conversation téléphonique \ +de Mardi , pourriez vous me dire la \n somme que je vous dois afin d'd'être \ +en régularisation . \n \n Merci bonne journée \n \n Le mar. 22 mai 2018 \ +à 10:20, a écrit\xa0: \n Bonjour. \n \n \ +Merci de bien vouloir prendre connaissance du document ci-joint : \n 1 - \ +Relevé d'identité postal MUTUELLE (contrats) \n \n Sentiments \ +mutualistes. \n \n La Mututelle \n \n La visualisation des fichiers PDF \ +nécessite Adobe Reader. \n " + + +output = [ + {'text': " \n \n \n Bonjours, \n \n Suite a notre conversation \ +téléphonique de Mardi , pourriez vous me dire la \n somme que je vous \ +dois afin d'd'être en régularisation . \n \n Merci bonne journée", + 'meta': ''}, + {'text': " \n Bonjour. \n \n Merci de bien vouloir prendre connaissance \ +du document ci-joint : \n 1 - Relevé d'identité postal MUTUELLE \ +(contrats) \n \n Sentiments mutualistes. \n \n La Mututelle \n \n \ +La visualisation des fichiers PDF nécessite Adobe Reader. \n ", + 'meta': ' \n \n Le mar. 
22 mai 2018 à 10:20, \ + a écrit\xa0:'}] + + +def test_build_historic(): + input_df = pd.DataFrame({ + 'body': [body] + }) + + output_df = pd.Series([output]) + + result = input_df.apply(build_historic, axis=1) + pd.testing.assert_series_equal(result, output_df) diff --git a/tests/unit_tests/prepare_email/test_cleaning.py b/tests/unit_tests/prepare_email/test_cleaning.py new file mode 100644 index 00000000..3c23e827 --- /dev/null +++ b/tests/unit_tests/prepare_email/test_cleaning.py @@ -0,0 +1,54 @@ +import pytest +import numpy as np +from melusine.prepare_email.cleaning import ( + remove_multiple_spaces_and_strip_text, remove_accents, + flag_items, remove_transfer_answer_header) + + +@pytest.mark.parametrize("input_str, expected_str", [ + ("hello world", "hello world"), + ("\n hello world ", "hello world"), + ("----- hello\tworld *****", "hello world"), + ("hello-world", "hello-world"), + ("hello - world", "hello world") +]) +def test_remove_multiple_spaces_and_strip_text(input_str, expected_str): + result = remove_multiple_spaces_and_strip_text(input_str) + np.testing.assert_string_equal(result, expected_str) + + +def test_remove_accents(): + input_str = "éèëêàù" + expected_str = "eeeeau" + + result = remove_accents(input_str) + np.testing.assert_string_equal(result, expected_str) + + +@pytest.mark.parametrize("input_str, expected_str", [ + ("RE: hello world", " hello world"), + ("re :", ""), + ("TR: hello", " hello"), + ("hello ", "hello "), + ("Fwd:hello", "hello") +]) +def test_remove_transfer_answer_header(input_str, expected_str): + result = remove_transfer_answer_header(input_str) + np.testing.assert_string_equal(result, expected_str) + + +@pytest.mark.parametrize("input_str, expected_str", [ + ("Bonjour, mon email : prenom.nom@hotmail.fr", "Bonjour, mon email : flag_mail_ "), + ("Mon numéro : 01.23.45.67.89", "Mon numéro : flag_phone_ "), + ("01 23 45 67 89 et 01.23.45.67.89", " flag_phone_ et flag_phone_ "), + ("mon numéro01 23 45 67 89", "mon numéro flag_phone_ "), + ("le montant du contrat est du 18000$, soit 17000euros", + "le montant du contrat est du flag_amount_ , soit flag_amount_ "), + ("J'habite au 1 rue de la paix, Paris 75002", + "J'habite au 1 rue de la paix, Paris flag_cp_ "), + ("Rendez-vous le 18 décembre 2019 ou le 19/12/19 ou le 20.12.19 à 14h30", + "Rendez-vous le flag_date_ ou le flag_date_ ou le flag_date_ à flag_time_ ") +]) +def test_flag_items(input_str, expected_str): + result = flag_items(input_str) + np.testing.assert_string_equal(result, expected_str) diff --git a/tests/unit_tests/prepare_email/test_mail_segmenting.py b/tests/unit_tests/prepare_email/test_mail_segmenting.py new file mode 100644 index 00000000..682ddbca --- /dev/null +++ b/tests/unit_tests/prepare_email/test_mail_segmenting.py @@ -0,0 +1,58 @@ +import pandas as pd +from melusine.prepare_email.mail_segmenting import structure_email + +structured_historic = [ + {'text': " \n \n \n Bonjours, \n \n Suite a notre conversation \ +téléphonique de Mardi , pourriez vous me dire la \n somme que je vous \ +dois afin d'd'être en régularisation . \n \n Merci bonne journée", + 'meta': ''}, + {'text': " \n Bonjour. \n \n Merci de bien vouloir prendre connaissance \ +du document ci-joint : \n 1 - Relevé d'identité postal MUTUELLE \ +(contrats) \n \n Sentiments mutualistes. \n \n La Mututelle \n \n \ +La visualisation des fichiers PDF nécessite Adobe Reader. \n ", + 'meta': ' \n \n Le mar. 
22 mai 2018 à 10:20, \ + a écrit\xa0:'}] + +output = [{'meta': {'date': None, 'from': None, 'to': None}, + 'structured_text': {'header': None, + 'text': [ + {'part': ' Bonjours, ', + 'tags': 'HELLO'}, + {'part': " Suite a notre conversation \ +téléphonique de Mardi , pourriez vous me dire la somme que je vous dois \ +afin d'd'être en régularisation . \n \n ", + 'tags': 'BODY'}, + {'part': 'Merci bonne journée', + 'tags': 'GREETINGS'} + ] + } + }, + {'meta': {'date': ' mar. 22 mai 2018 à 10:20', + 'from': ' ', + 'to': None}, + 'structured_text': {'header': None, + 'text': [ + {'part': ' Bonjour. \n \n ', + 'tags': 'HELLO'}, + {'part': "Merci de bien vouloir prendre \ +connaissance du document ci-joint : 1 - Relevé d'identité postal MUTUELLE \ +(contrats) ", + 'tags': 'BODY'}, + {'part': ' Sentiments mutualistes. ', + 'tags': 'GREETINGS'}, + {'part': ' La Mututelle ', + 'tags': 'BODY'}, + {'part': ' La visualisation des fichiers \ +PDF nécessite Adobe Reader. \n', + 'tags': 'FOOTER'}]}}] + + +def test_structure_email(): + input_df = pd.DataFrame({ + 'structured_historic': [structured_historic] + }) + + output_df = pd.Series([output]) + + result = input_df.apply(structure_email, axis=1) + pd.testing.assert_series_equal(result, output_df) diff --git a/tests/unit_tests/prepare_email/test_manage_transfer_reply.py b/tests/unit_tests/prepare_email/test_manage_transfer_reply.py new file mode 100644 index 00000000..b50baf89 --- /dev/null +++ b/tests/unit_tests/prepare_email/test_manage_transfer_reply.py @@ -0,0 +1,60 @@ +import pandas as pd +import numpy as np +from melusine.prepare_email.manage_transfer_reply import ( + add_boolean_answer, add_boolean_transfer, check_mail_begin_by_transfer) + + +def test_add_boolean_answer(): + input_df = pd.DataFrame({ + 'header': ["Bonjour, je suis disponible", + "RE: bonjour", + "re: bonjour", + "TR: bonjour", + "Fwd: bonjour", + "Re: bonjour", + np.nan, + ''], + }) + + output_df = pd.Series([False, True, True, False, False, True, False, + False]) + + result = input_df.apply(add_boolean_answer, axis=1) + pd.testing.assert_series_equal(result, output_df) + + +def test_add_boolean_transfer(): + input_df = pd.DataFrame({ + 'header': ["Bonjour, je suis disponible", + "RE: bonjour", + "re: bonjour", + "TR: bonjour", + "Tr: bonjour", + "Fwd: bonjour", + "FW: bonjour", + "FWD: bonjour", + "Fw: bonjour", + np.nan, + ''], + }) + + output_df = pd.Series([False, False, False, True, True, True, True, True, + True, False, False]) + + result = input_df.apply(add_boolean_transfer, axis=1) + pd.testing.assert_series_equal(result, output_df) + + +def test_check_mail_begin_by_transfer(): + input_df = pd.DataFrame({ + 'body': ["Bonjour, je suis disponible", + "--- Transféré par ---- Bonjour, je suis disponible", + "De : : salut, je suis disponible", + np.nan, + ''] + }) + + output_df = pd.Series([False, True, True, False, False]) + + result = input_df.apply(check_mail_begin_by_transfer, axis=1) + pd.testing.assert_series_equal(result, output_df) diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..8424adc7 --- /dev/null +++ b/tox.ini @@ -0,0 +1,25 @@ +[tox] +envlist = py35, py36, py37, flake8 + +[travis] +python = + 3.7: py37 + 3.6: py36 + 3.5: py35 + +[testenv:flake8] +basepython = python +deps = flake8 +commands = flake8 melusine + +[testenv] +setenv = + PYTHONPATH = {toxinidir} +deps = + -r{toxinidir}/requirements_dev.txt +; If you want to make tox run the tests with the same versions, create a +; requirements.txt with the pinned versions and uncomment the 
following line: +; -r{toxinidir}/requirements.txt +commands = + pip install -U pip + py.test --basetemp={envtmpdir} diff --git a/tutorial/data/emails_anonymized.pickle b/tutorial/data/emails_anonymized.pickle new file mode 100644 index 00000000..510cc3c9 Binary files /dev/null and b/tutorial/data/emails_anonymized.pickle differ
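
Taken together, the pieces added above can be chained end to end. The sketch below is illustrative only: it assumes `tutorial/data/emails_anonymized.pickle` holds a pickled pandas DataFrame with `body` and `header` columns (the same columns the unit tests exercise), and the output column names are placeholders rather than part of the package API::

    import pandas as pd

    from melusine.utils.transformer_scheduler import TransformerScheduler
    from melusine.prepare_email.manage_transfer_reply import (
        add_boolean_answer, add_boolean_transfer, check_mail_begin_by_transfer)

    # Assumed schema: a DataFrame with 'body' and 'header' columns.
    emails = pd.read_pickle('tutorial/data/emails_anonymized.pickle')

    # Chain the row-wise flagging functions into one scikit-learn compatible
    # transformer; the returned column names are illustrative.
    flagger = TransformerScheduler(functions_scheduler=[
        (add_boolean_answer, None, ['is_answer']),
        (add_boolean_transfer, None, ['is_transfer']),
        (check_mail_begin_by_transfer, None, ['is_begin_by_transfer']),
    ])

    emails = flagger.fit_transform(emails)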