diff --git a/Makefile b/Makefile new file mode 100755 index 0000000..dba5abf --- /dev/null +++ b/Makefile @@ -0,0 +1,71 @@ +NAME=master-thesis +LANG_ASPELL=en + +IMG_DIR=img +SRC_DIR=src + +MISC=Makefile + +MAJOR_VERSION=0 +MINOR_VERSION=1 +VERSION=$(MAJOR_VERSION).$(MINOR_VERSION) + +UNAME=$(shell uname) + +BIBTEX=bibtex +TEX=xelatex +TEX_FILES=$(wildcard $(SRC_DIR)/*.tex) +TEX_FLAGS=--shell-escape --synctex=1 + +all: $(NAME) + +$(NAME): + @echo "** Compiling the \"$(NAME)\" file." + $(TEX) $(TEX_FLAGS) $(NAME) + +check: $(addsuffix .spchk,$(basename $(TEX_FILES))) + +%.spchk: %.tex + aspell -x -l $(LANG_ASPELL) -t -c $< + +clean: + @echo "** Removing subsidiary TeX files." + $(RM) $(NAME).{aux,bbl,blg,lof,log,lom,lot,out,snm,tex.backup,nav,toc} + +help: + @echo "make COMMAND" + @echo + @echo "COMMAND:" + @echo " all compile the \"$(NAME)\" file" + @echo " check check spelling of TeX files" + @echo " clean remove subsidiary TeX files" + @echo " help display available commands" + @echo " full compile the \"$(NAME)\" file and the bibliography" + @echo " open open the \"$(NAME)\" file" + @echo " tar create a tar archive" + +full: + @echo "** Compiling the \"$(NAME)\" file and the bibliography." + $(TEX) $(TEX_FLAGS) $(NAME) + $(BIBTEX) $(NAME) + $(TEX) $(TEX_FLAGS) $(NAME) + $(TEX) $(TEX_FLAGS) $(NAME) + +open: +ifeq ($(UNAME), Linux) + @echo "** Opening the \"$(NAME)\" file with evince." + evince $(NAME).pdf & +endif +ifeq ($(UNAME), Darwin) + @echo "** Opening the \"$(NAME)\" file with preview." + open $(NAME).pdf & +endif + +tar: + @echo "** Creating the tar archive." + @mkdir -p $(NAME)-$(VERSION) + @cp -r $(SRC_DIR) $(IMG_DIR) $(MISC) $(NAME).tex $(NAME)-$(VERSION) + tar czvf $(NAME)-$(VERSION).tar.gz $(NAME)-$(VERSION) + $(RM) -r $(NAME)-$(VERSION) + +.PHONY: all check clean help full open tar $(NAME) diff --git a/bibli.bib b/bibli.bib new file mode 100644 index 0000000..108c130 --- /dev/null +++ b/bibli.bib @@ -0,0 +1,1077 @@ +@article{DBLP:journals/tacl/BojanowskiGJM17, + author = {Piotr Bojanowski and + Edouard Grave and + Armand Joulin and + Tom{\'{a}}s Mikolov}, + title = {Enriching Word Vectors with Subword Information}, + journal = {Trans. Assoc. Comput. Linguistics}, + volume = {5}, + pages = {135--146}, + year = {2017}, + url = {https://transacl.org/ojs/index.php/tacl/article/view/999}, + timestamp = {Wed, 17 Feb 2021 21:55:26 +0100}, + biburl = {https://dblp.org/rec/journals/tacl/BojanowskiGJM17.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/emnlp/LuongPM15, + author = {Thang Luong and + Hieu Pham and + Christopher D.
Manning}, + editor = {Llu{\'{\i}}s M{\`{a}}rquez and + Chris Callison{-}Burch and + Jian Su and + Daniele Pighin and + Yuval Marton}, + title = {Effective Approaches to Attention-based Neural Machine Translation}, + booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural + Language Processing, {EMNLP} 2015, Lisbon, Portugal, September 17-21, + 2015}, + pages = {1412--1421}, + publisher = {The Association for Computational Linguistics}, + year = {2015}, + url = {https://doi.org/10.18653/v1/d15-1166}, + doi = {10.18653/v1/d15-1166}, + timestamp = {Tue, 28 Jan 2020 10:28:39 +0100}, + biburl = {https://dblp.org/rec/conf/emnlp/LuongPM15.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/acl/HendrycksLWDKS20, + author = {Dan Hendrycks and + Xiaoyuan Liu and + Eric Wallace and + Adam Dziedzic and + Rishabh Krishnan and + Dawn Song}, + editor = {Dan Jurafsky and + Joyce Chai and + Natalie Schluter and + Joel R. Tetreault}, + title = {Pretrained Transformers Improve Out-of-Distribution Robustness}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational + Linguistics, {ACL} 2020, Online, July 5-10, 2020}, + pages = {2744--2751}, + publisher = {Association for Computational Linguistics}, + year = {2020}, + url = {https://doi.org/10.18653/v1/2020.acl-main.244}, + doi = {10.18653/v1/2020.acl-main.244}, + timestamp = {Fri, 08 Jan 2021 21:20:23 +0100}, + biburl = {https://dblp.org/rec/conf/acl/HendrycksLWDKS20.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:beseiso, +author = {Beseiso, Majdi and Alzahrani, Saleh}, +year = {2020}, +month = {11}, +pages = {204-210}, +title = {An Empirical Analysis of BERT Embedding for Automated Essay Scoring}, +volume = {11}, +journal = {International Journal of Advanced Computer Science and Applications}, +doi = {10.14569/IJACSA.2020.0111027} +} + +@inproceedings{DBLP:conf/embc/SahaLG20, + author = {Budhaditya Saha and + Sanal Lisboa and + Shameek Ghosh}, + title = {Understanding patient complaint characteristics using contextual clinical + {BERT} embeddings}, + booktitle = {42nd Annual International Conference of the {IEEE} Engineering in + Medicine {\&} Biology Society, {EMBC} 2020, Montreal, QC, Canada, + July 20-24, 2020}, + pages = {5847--5850}, + publisher = {{IEEE}}, + year = {2020}, + url = {https://doi.org/10.1109/EMBC44109.2020.9175577}, + doi = {10.1109/EMBC44109.2020.9175577}, + timestamp = {Fri, 25 Dec 2020 01:14:44 +0100}, + biburl = {https://dblp.org/rec/conf/embc/SahaLG20.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + + +@article{DBLP:journals/jodsn/CeravoloAACCDMK18, + author = {Paolo Ceravolo and + Antonia Azzini and + Marco Angelini and + Tiziana Catarci and + Philippe Cudr{\'{e}}{-}Mauroux and + Ernesto Damiani and + Alexandra Mazak and + Maurice van Keulen and + Mustafa Jarrar and + Giuseppe Santucci and + Kai{-}Uwe Sattler and + Monica Scannapieco and + Manuel Wimmer and + Robert Wrembel and + Fadi A. Zaraket}, + title = {Big Data Semantics}, + journal = {J. 
Data Semant.}, + volume = {7}, + number = {2}, + pages = {65--85}, + year = {2018}, + url = {https://doi.org/10.1007/s13740-018-0086-2}, + doi = {10.1007/s13740-018-0086-2}, + timestamp = {Tue, 08 Sep 2020 13:32:22 +0200}, + biburl = {https://dblp.org/rec/journals/jodsn/CeravoloAACCDMK18.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{singhal_2012, +title={Introducing the Knowledge Graph: things, not strings}, +url={https://www.blog.google/products/search/introducing-knowledge-graph-things-not}, +journal={Google}, +publisher={Google}, +author={Singhal, Amit}, +year={2012}, +month={May}, +note={Accessed: May 2021} +} + +@article{article:genese, + author = {Genet Asefa Gesese and + Russa Biswas and + Mehwish Alam and + Harald Sack}, + title = {A Survey on Knowledge Graph Embeddings with Literals: Which model + links better Literal-ly?}, + journal = {CoRR}, + volume = {abs/1910.12507}, + year = {2019}, + url = {http://arxiv.org/abs/1910.12507}, + archivePrefix = {arXiv}, + eprint = {1910.12507}, + timestamp = {Thu, 31 Oct 2019 14:02:26 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-1910-12507.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:vandewiele, + author = {Gilles Vandewiele and + Bram Steenwinckel and + Pieter Bonte and + Michael Weyns and + Heiko Paulheim and + Petar Ristoski and + Filip De Turck and + Femke Ongenae}, + title = {{Walk Extraction Strategies for Node Embeddings with RDF2Vec in Knowledge + Graphs}}, + journal = {CoRR}, + volume = {abs/2009.04404}, + year = {2020}, + url = {https://arxiv.org/abs/2009.04404}, + archivePrefix = {arXiv}, + eprint = {2009.04404}, + timestamp = {Thu, 17 Sep 2020 12:49:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2009-04404.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{inproceedings:kristiadi, + author = {Agustinus Kristiadi and + Mohammad Asif Khan and + Denis Lukovnikov and + Jens Lehmann and + Asja Fischer}, + editor = {Chiara Ghidini and + Olaf Hartig and + Maria Maleshkova and + Vojtech Sv{\'{a}}tek and + Isabel F. Cruz and + Aidan Hogan and + Jie Song and + Maxime Lefran{\c{c}}ois and + Fabien Gandon}, + title = {Incorporating Literals into Knowledge Graph Embeddings}, + booktitle = {The Semantic Web - {ISWC} 2019 - 18th International Semantic Web Conference, + Auckland, New Zealand, October 26-30, 2019, Proceedings, Part {I}}, + series = {Lecture Notes in Computer Science}, + volume = {11778}, + pages = {347--363}, + publisher = {Springer}, + year = {2019}, + url = {https://doi.org/10.1007/978-3-030-30793-6\_20}, + doi = {10.1007/978-3-030-30793-6\_20}, + timestamp = {Sat, 30 May 2020 20:05:29 +0200}, + biburl = {https://dblp.org/rec/conf/semweb/KristiadiKL0F19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{vaswani:attention, + author = {Ashish Vaswani and + Noam Shazeer and + Niki Parmar and + Jakob Uszkoreit and + Llion Jones and + Aidan N. Gomez and + Lukasz Kaiser and + Illia Polosukhin}, + editor = {Isabelle Guyon and + Ulrike von Luxburg and + Samy Bengio and + Hanna M. Wallach and + Rob Fergus and + S. V. N. 
Vishwanathan and + Roman Garnett}, + title = {Attention is All you Need}, + booktitle = {Advances in Neural Information Processing Systems 30: Annual Conference + on Neural Information Processing Systems 2017, December 4-9, 2017, + Long Beach, CA, {USA}}, + pages = {5998--6008}, + year = {2017}, + url = {https://proceedings.neurips.cc/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html}, + timestamp = {Thu, 21 Jan 2021 15:15:21 +0100}, + biburl = {https://dblp.org/rec/conf/nips/VaswaniSPUJGKP17.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{vaswani, + author = {Cheng{-}Zhi Anna Huang and + Ashish Vaswani and + Jakob Uszkoreit and + Noam Shazeer and + Curtis Hawthorne and + Andrew M. Dai and + Matthew D. Hoffman and + Douglas Eck}, + title = {An Improved Relative Self-Attention Mechanism for Transformer with + Application to Music Generation}, + journal = {CoRR}, + volume = {abs/1809.04281}, + year = {2018}, + url = {http://arxiv.org/abs/1809.04281}, + archivePrefix = {arXiv}, + eprint = {1809.04281}, + timestamp = {Fri, 05 Oct 2018 11:34:52 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1809-04281.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{bahdanau, + author = {Jan Chorowski and + Dzmitry Bahdanau and + Dmitriy Serdyuk and + Kyunghyun Cho and + Yoshua Bengio}, + editor = {Corinna Cortes and + Neil D. Lawrence and + Daniel D. Lee and + Masashi Sugiyama and + Roman Garnett}, + title = {Attention-Based Models for Speech Recognition}, + booktitle = {Advances in Neural Information Processing Systems 28: Annual Conference + on Neural Information Processing Systems 2015, December 7-12, 2015, + Montreal, Quebec, Canada}, + pages = {577--585}, + year = {2015}, + url = {http://papers.nips.cc/paper/5847-attention-based-models-for-speech-recognition}, + timestamp = {Fri, 06 Mar 2020 16:57:49 +0100}, + biburl = {https://dblp.org/rec/conf/nips/ChorowskiBSCB15.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{cheng, + author = {Jianpeng Cheng and + Li Dong and + Mirella Lapata}, + editor = {Jian Su and + Xavier Carreras and + Kevin Duh}, + title = {Long Short-Term Memory-Networks for Machine Reading}, + booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural + Language Processing, {EMNLP} 2016, Austin, Texas, USA, November 1-4, + 2016}, + pages = {551--561}, + publisher = {The Association for Computational Linguistics}, + year = 2016, + url = {https://doi.org/10.18653/v1/d16-1053}, + doi = {10.18653/v1/d16-1053}, + timestamp = {Tue, 28 Jan 2020 10:28:37 +0100}, + biburl = {https://dblp.org/rec/conf/emnlp/0001DL16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +% RELATED WORK + +@inproceedings{khatri, + author = {Akshay Khatri and + Pranav P}, + editor = {Beata Beigman Klebanov and + Ekaterina Shutova and + Patricia Lichtenstein and + Smaranda Muresan and + Chee Wee Leong and + Anna Feldman and + Debanjan Ghosh}, + title = {Sarcasm Detection in Tweets with {BERT} and GloVe Embeddings}, + booktitle = {Proceedings of the Second Workshop on Figurative Language Processing, + Fig-Lang@ACL 2020, Online, July 9, 2020}, + pages = {56--60}, + publisher = {Association for Computational Linguistics}, + year = {2020}, + url = {https://www.aclweb.org/anthology/2020.figlang-1.7/}, + timestamp = {Fri, 14 Aug 2020 11:51:21 +0200}, + biburl = {https://dblp.org/rec/conf/acl-figlang/KhatriP20.bib}, + 
bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{peng, + author = {Yifan Peng and + Shankai Yan and + Zhiyong Lu}, + editor = {Dina Demner{-}Fushman and + Kevin Bretonnel Cohen and + Sophia Ananiadou and + Junichi Tsujii}, + title = {Transfer Learning in Biomedical Natural Language Processing: An Evaluation + of {BERT} and ELMo on Ten Benchmarking Datasets}, + booktitle = {Proceedings of the 18th BioNLP Workshop and Shared Task, BioNLP@ACL + 2019, Florence, Italy, August 1, 2019}, + pages = {58--65}, + publisher = {Association for Computational Linguistics}, + year = {2019}, + url = {https://doi.org/10.18653/v1/w19-5006}, + doi = {10.18653/v1/w19-5006}, + timestamp = {Tue, 28 Jan 2020 10:29:22 +0100}, + biburl = {https://dblp.org/rec/conf/bionlp/PengYL19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/semweb/RistoskiRNLP19, + author = {Petar Ristoski and + Jessica Rosati and + Tommaso Di Noia and + Renato De Leone and + Heiko Paulheim}, + title = {RDF2Vec: {RDF} graph embeddings and their applications}, + journal = {Semantic Web}, + volume = {10}, + number = {4}, + pages = {721--752}, + year = {2019}, + url = {https://doi.org/10.3233/SW-180317}, + doi = {10.3233/SW-180317}, + timestamp = {Wed, 25 Sep 2019 17:53:41 +0200}, + biburl = {https://dblp.org/rec/journals/semweb/RistoskiRNLP19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{inproceedings:mikolov, + author = {Tomas Mikolov and + Kai Chen and + Greg Corrado and + Jeffrey Dean}, + editor = {Yoshua Bengio and + Yann LeCun}, + title = {Efficient Estimation of Word Representations in Vector Space}, + booktitle = {1st International Conference on Learning Representations, {ICLR} 2013, + Scottsdale, Arizona, USA, May 2-4, 2013, Workshop Track Proceedings}, + year = {2013}, + url = {http://arxiv.org/abs/1301.3781}, + timestamp = {Thu, 25 Jul 2019 14:25:36 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1301-3781.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{ethayarajh, + author = {Kawin Ethayarajh}, + editor = {Kentaro Inui and + Jing Jiang and + Vincent Ng and + Xiaojun Wan}, + title = {How Contextual are Contextualized Word Representations? 
Comparing + the Geometry of BERT, ELMo, and {GPT-2} Embeddings}, + booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural + Language Processing and the 9th International Joint Conference on + Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China, + November 3-7, 2019}, + pages = {55--65}, + publisher = {Association for Computational Linguistics}, + year = {2019}, + url = {https://doi.org/10.18653/v1/D19-1006}, + doi = {10.18653/v1/D19-1006}, + timestamp = {Thu, 12 Dec 2019 13:23:49 +0100}, + biburl = {https://dblp.org/rec/conf/emnlp/Ethayarajh19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{distilbert, + author = {Victor Sanh and + Lysandre Debut and + Julien Chaumond and + Thomas Wolf}, + title = {DistilBERT, a distilled version of {BERT:} smaller, faster, cheaper + and lighter}, + journal = {CoRR}, + volume = {abs/1910.01108}, + year = {2019}, + url = {http://arxiv.org/abs/1910.01108}, + archivePrefix = {arXiv}, + eprint = {1910.01108}, + timestamp = {Tue, 02 Jun 2020 12:48:59 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1910-01108.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{zhang, + author = {Jiawei Zhang and + Haopeng Zhang and + Congying Xia and + Li Sun}, + title = {Graph-Bert: Only Attention is Needed for Learning Graph Representations}, + journal = {CoRR}, + volume = {abs/2001.05140}, + year = {2020}, + url = {https://arxiv.org/abs/2001.05140}, + archivePrefix = {arXiv}, + eprint = {2001.05140}, + timestamp = {Thu, 12 Nov 2020 15:00:59 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-2001-05140.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:mukherjee, + author = {Sourav Mukherjee and + Tim Oates and + Ryan Wright}, + title = {{Graph Node Embeddings using Domain-Aware Biased Random Walks}}, + journal = {CoRR}, + volume = {abs/1908.02947}, + year = 2019, + url = {http://arxiv.org/abs/1908.02947}, + archivePrefix = {arXiv}, + eprint = {1908.02947}, + timestamp = {Fri, 09 Aug 2019 12:15:56 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1908-02947.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:hamilton, + author = {William L. Hamilton and + Rex Ying and + Jure Leskovec}, + title = {Representation Learning on Graphs: Methods and Applications}, + journal = {{IEEE} Data Eng. Bull.}, + volume = {40}, + number = {3}, + pages = {52--74}, + year = {2017}, + url = {http://sites.computer.org/debull/A17sept/p52.pdf}, + timestamp = {Tue, 10 Mar 2020 16:23:49 +0100}, + biburl = {https://dblp.org/rec/journals/debu/HamiltonYL17.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:nickel, + author = {Maximilian Nickel and + Kevin Murphy and + Volker Tresp and + Evgeniy Gabrilovich}, + title = {A Review of Relational Machine Learning for Knowledge Graphs}, + journal = {Proc. 
{IEEE}}, + volume = {104}, + number = {1}, + pages = {11--33}, + year = {2016}, + url = {https://doi.org/10.1109/JPROC.2015.2483592}, + doi = {10.1109/JPROC.2015.2483592}, + timestamp = {Mon, 26 Oct 2020 08:55:17 +0100}, + biburl = {https://dblp.org/rec/journals/pieee/Nickel0TG16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:ristoski:rdf2vec, + author = {Petar Ristoski and + Jessica Rosati and + Tommaso Di Noia and + Renato De Leone and + Heiko Paulheim}, + title = {RDF2Vec: {RDF} graph embeddings and their applications}, + journal = {Semantic Web}, + volume = {10}, + number = {4}, + pages = {721--752}, + year = {2019}, + url = {https://doi.org/10.3233/SW-180317}, + doi = {10.3233/SW-180317}, + timestamp = {Wed, 25 Sep 2019 17:53:41 +0200}, + biburl = {https://dblp.org/rec/journals/semweb/RistoskiRNLP19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:ristoski:semantic:web, + author = {Petar Ristoski and + Heiko Paulheim}, + title = {Semantic Web in data mining and knowledge discovery: {A} comprehensive + survey}, + journal = {J. Web Semant.}, + volume = {36}, + pages = {1--22}, + year = {2016}, + url = {https://doi.org/10.1016/j.websem.2016.01.001}, + doi = {10.1016/j.websem.2016.01.001}, + timestamp = {Tue, 29 Jan 2019 12:27:04 +0100}, + biburl = {https://dblp.org/rec/journals/ws/RistoskiP16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:taweel, + author = {Ahmad Al Taweel and + Heiko Paulheim}, + title = {{Towards Exploiting Implicit Human Feedback for Improving RDF2vec Embeddings}}, + journal = {CoRR}, + volume = {abs/2004.04423}, + year = {2020}, + url = {https://arxiv.org/abs/2004.04423}, + archivePrefix = {arXiv}, + eprint = {2004.04423}, + timestamp = {Tue, 14 Apr 2020 01:00:00 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2004-04423.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:vries, + author = {Gerben Klaas Dirk de Vries and + Steven de Rooij}, + title = {Substructure counting graph kernels for machine learning from {RDF} + data}, + journal = {J. 
Web Semant.}, + volume = {35}, + pages = {71--84}, + year = {2015}, + url = {https://doi.org/10.1016/j.websem.2015.08.002}, + doi = {10.1016/j.websem.2015.08.002}, + timestamp = {Tue, 29 Jan 2019 12:27:03 +0100}, + biburl = {https://dblp.org/rec/journals/ws/VriesR15.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{inproceedings:cochez, + author = {Michael Cochez and + Petar Ristoski and + Simone Paolo Ponzetto and + Heiko Paulheim}, + editor = {Rajendra Akerkar and + Alfredo Cuzzocrea and + Jannong Cao and + Mohand{-}Said Hacid}, + title = {Biased graph walks for {RDF} graph embeddings}, + booktitle = {Proceedings of the 7th International Conference on Web Intelligence, + Mining and Semantics, {WIMS} 2017, Amantea, Italy, June 19-22, 2017}, + pages = {21:1--21:12}, + publisher = {{ACM}}, + year = 2017, + url = {https://doi.org/10.1145/3102254.3102279}, + doi = {10.1145/3102254.3102279}, + timestamp = {Fri, 27 Mar 2020 08:55:29 +0100}, + biburl = {https://dblp.org/rec/conf/wims/CochezRPP17.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{mccormick, +author = {Chris McCormick}, +title={{\textit{Word2Vec Tutorial Part 2 - Negative Sampling}}}, +url={http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/}, +journal={Word2Vec Tutorial Part 2 - Negative Sampling}, +year={2017}, +month={Jan}, +note={Accessed: April 2021} +} + +@inproceedings{inproceedings:ristoski:strategies, + author = {Petar Ristoski and + Heiko Paulheim}, + editor = {Ilaria Tiddi and + Mathieu d'Aquin and + Nicolas Jay}, + title = {A Comparison of Propositionalization Strategies for Creating Features + from Linked Open Data}, + booktitle = {Proceedings of the 1st Workshop on Linked Data for Knowledge Discovery + co-located with European Conference on Machine Learning and Principles + and Practice of Knowledge Discovery in Databases {(ECML} {PKDD} 2014), + Nancy, France, September 19th, 2014}, + series = {{CEUR} Workshop Proceedings}, + volume = {1232}, + publisher = {CEUR-WS.org}, + year = {2014}, + url = {http://ceur-ws.org/Vol-1232/paper1.pdf}, + timestamp = {Wed, 12 Feb 2020 16:44:28 +0100}, + biburl = {https://dblp.org/rec/conf/pkdd/RistoskiP14.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{website:fourkind, +author={Max Pagels}, +title={{\textit{What is Online Machine Learning?}}}, +url={https://medium.com/value-stream-design/online-machine-learning-515556ff72c5}, +journal={Medium}, +publisher={The Hands-on Advisors}, +year={2020}, +month={Feb}, +note={Accessed: March 2021} +} + +@misc{website:deepai:unsupervised:learning, +author={Thomas Wood}, +title={{\textit{Unsupervised Learning}}}, +howpublished={\href{https://deepai.org/machine-learning-glossary-and-terms/unsupervised-learning} +{https://deepai.org/machine-learning-glossary-and-terms/unsupervised-learning}}, +journal={DeepAI}, +year={2020}, +month={Aug}, +note={Accessed: March 2021} +} + +@misc{website:deepai:softmax, +author={Thomas Wood}, +title={{\textit{Softmax Function}}}, +howpublished={\href{https://deepai.org/machine-learning-glossary-and-terms/softmax-layer} +{https://deepai.org/machine-learning-glossary-and-terms/softmax-layer}}, +journal={DeepAI}, +year=2019, +month={May}, +note={Accessed: April 2021} +} + +@misc{website:deepai:one:hot:encoding, +author={DeepAI}, +title={{\textit{One Hot Encoding}}}, +howpublished={\href{https://deepai.org/machine-learning-glossary-and-terms/one-hot-encoding}}, 
+url={https://deepai.org/machine-learning-glossary-and-terms/one-hot-encoding}, +journal={DeepAI}, +publisher={DeepAI}, +year={2019}, +month={May}, +note={Accessed: April 2021} +} + +@misc{website:deepai:cosine:similarity, +author={{DeepAI}}, +title={{\textit{Cosine Similarity}}}, +url={https://deepai.org/machine-learning-glossary-and-terms/cosine-similarity}, +journal={DeepAI}, +year={2019}, +month={May}, +note={Accessed: March 2021} +} + +@misc{website:medium:embedding:matrix, +author={Ashwin Prasad}, +title={{\textit{Word Embeddings Explained}}}, +howpublished={\href{https://medium.com/analytics-vidhya/word-embeddings-explained-62c046f7c79e}}, +url={https://medium.com/analytics-vidhya/word-embeddings-explained-62c046f7c79e}, +journal={Medium}, +publisher={Analytics Vidhya}, +year={2020}, +month={Sep}, +note={Accessed: April 2021} +} + + +@misc{website:deepai:distributed:representation, +author={{DeepAI}}, +title={{\textit{Distributed Representations}}}, +url={https://deepai.org/machine-learning-glossary-and-terms/distributed-representation}, +journal={DeepAI}, +year={2019}, +month={May}, +note={Accessed: March 2021} +} + +@misc{website:kamakoti, +author={Balaji Kamakoti}, +title={{\textit{Introduction to Knowledge Graph Embedding with DGL-KE}}}, +url={https://towardsdatascience.com/introduction-to-knowledge-graph-embedding-with-dgl-ke-77ace6fb60ef}, +journal={Medium}, +publisher={Towards Data Science}, +year={2020}, +month={Jun}, +note={Accessed: April 2021} +} + +@inproceedings{inproceddings:oza, + author = {Nikunj C. Oza}, + title = {Online bagging and boosting}, + booktitle = {Proceedings of the {IEEE} International Conference on Systems, Man + and Cybernetics, Waikoloa, Hawaii, USA, October 10-12, 2005}, + pages = {2340--2345}, + publisher = {{IEEE}}, + year = {2005}, + url = {https://doi.org/10.1109/ICSMC.2005.1571498}, + doi = {10.1109/ICSMC.2005.1571498}, + timestamp = {Wed, 16 Oct 2019 14:14:51 +0200}, + biburl = {https://dblp.org/rec/conf/smc/Oza05.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{article:hoi, + author = {Steven C. H. Hoi and + Doyen Sahoo and + Jing Lu and + Peilin Zhao}, + title = {Online Learning: {A} Comprehensive Survey}, + journal = {CoRR}, + volume = {abs/1802.02871}, + year = {2018}, + url = {http://arxiv.org/abs/1802.02871}, + archivePrefix = {arXiv}, + eprint = {1802.02871}, + timestamp = {Mon, 13 Aug 2018 16:47:30 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1802-02871.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/nips/MikolovSCCD13, + author = {Tom{\'{a}}s Mikolov and + Ilya Sutskever and + Kai Chen and + Gregory S. Corrado and + Jeffrey Dean}, + editor = {Christopher J. C. Burges and + L{\'{e}}on Bottou and + Zoubin Ghahramani and + Kilian Q. Weinberger}, + title = {Distributed Representations of Words and Phrases and their Compositionality}, + booktitle = {Advances in Neural Information Processing Systems 26: 27th Annual + Conference on Neural Information Processing Systems 2013. 
Proceedings + of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States}, + pages = {3111--3119}, + year = {2013}, + url = {https://proceedings.neurips.cc/paper/2013/hash/9aa42b31882ec039965f3c4923ce901b-Abstract.html}, + timestamp = {Thu, 21 Jan 2021 15:15:23 +0100}, + biburl = {https://dblp.org/rec/conf/nips/MikolovSCCD13.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/aistats/MorinB05, + author = {Frederic Morin and + Yoshua Bengio}, + editor = {Robert G. Cowell and + Zoubin Ghahramani}, + title = {Hierarchical Probabilistic Neural Network Language Model}, + booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence + and Statistics, {AISTATS} 2005, Bridgetown, Barbados, January 6-8, + 2005}, + publisher = {Society for Artificial Intelligence and Statistics}, + year = {2005}, + url = {http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf}, + timestamp = {Wed, 06 May 2015 20:37:32 +0200}, + biburl = {https://dblp.org/rec/conf/aistats/MorinB05.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:journals/jmlr/GutmannH10, + author = {Michael Gutmann and + Aapo Hyv{\"{a}}rinen}, + editor = {Yee Whye Teh and + D. Mike Titterington}, + title = {Noise-contrastive estimation: {A} new estimation principle for unnormalized + statistical models}, + booktitle = {Proceedings of the Thirteenth International Conference on Artificial + Intelligence and Statistics, {AISTATS} 2010, Chia Laguna Resort, Sardinia, + Italy, May 13-15, 2010}, + series = {{JMLR} Proceedings}, + volume = {9}, + pages = {297--304}, + publisher = {JMLR.org}, + year = {2010}, + url = {http://proceedings.mlr.press/v9/gutmann10a.html}, + timestamp = {Wed, 29 May 2019 08:41:47 +0200}, + biburl = {https://dblp.org/rec/journals/jmlr/GutmannH10.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{weng_2017, +author={Lilian Weng}, +title={{\textit{Learning Word Embedding}}}, +url={https://lilianweng.github.io/lil-log/2017/10/15/learning-word-embedding.html}, +journal={Lil'Log}, +year={2017}, +month={Oct}, +note={Accessed: April 2021} +} + +@misc{sijun_he, +author={Sijun He}, +title={{\textit{Word Embeddings}}}, +url={https://sijunhe.github.io/blog/2018/09/12/word-embeddings/}, +journal={Blog | Sijun He}, +year={2018}, +month={Sep}, +note={Accessed: April 2021} +} + +@inproceedings{DBLP:conf/paclic/FanZCZ14, + author = {Miao Fan and + Qiang Zhou and + Emily Chang and + Thomas Fang Zheng}, + editor = {Wirote Aroonmanakun and + Prachya Boonkwan and + Thepchai Supnithi}, + title = {Transition-based Knowledge Graph Embedding with Relational Mapping + Properties}, + booktitle = {Proceedings of the 28th Pacific Asia Conference on Language, Information + and Computation, {PACLIC} 28, Cape Panwa Hotel, Phuket, Thailand, + December 12-14, 2014}, + pages = {328--337}, + publisher = {The {PACLIC} 28 Organizing Committee and {PACLIC} Steering Committee + / {ACL} / Department of Linguistics, Faculty of Arts, Chulalongkorn + University}, + year = {2014}, + url = {https://www.aclweb.org/anthology/Y14-1039/}, + timestamp = {Mon, 16 Sep 2019 17:08:53 +0200}, + biburl = {https://dblp.org/rec/conf/paclic/FanZCZ14.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{inproceedings:devlin, + author = {Jacob Devlin and + Ming{-}Wei Chang and + Kenton Lee and + Kristina Toutanova}, + editor = {Jill Burstein and + Christy Doran and + Thamar 
Solorio}, + title = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language + Understanding}, + booktitle = {Proceedings of the 2019 Conference of the North American Chapter of + the Association for Computational Linguistics: Human Language Technologies, + {NAACL-HLT} 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long + and Short Papers)}, + pages = {4171--4186}, + publisher = {Association for Computational Linguistics}, + year = {2019}, + url = {https://doi.org/10.18653/v1/n19-1423}, + doi = {10.18653/v1/n19-1423}, + timestamp = {Tue, 28 Jan 2020 10:30:29 +0100}, + biburl = {https://dblp.org/rec/conf/naacl/DevlinCLT19.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/nips/BordesUGWY13, + author = {Antoine Bordes and + Nicolas Usunier and + Alberto Garc{\'{\i}}a{-}Dur{\'{a}}n and + Jason Weston and + Oksana Yakhnenko}, + editor = {Christopher J. C. Burges and + L{\'{e}}on Bottou and + Zoubin Ghahramani and + Kilian Q. Weinberger}, + title = {Translating Embeddings for Modeling Multi-relational Data}, + booktitle = {Advances in Neural Information Processing Systems 26: 27th Annual + Conference on Neural Information Processing Systems 2013. Proceedings + of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States}, + pages = {2787--2795}, + year = {2013}, + url = {https://proceedings.neurips.cc/paper/2013/hash/1cecc7a77928ca8133fa24680a88d2f9-Abstract.html}, + timestamp = {Thu, 21 Jan 2021 15:15:23 +0100}, + biburl = {https://dblp.org/rec/conf/nips/BordesUGWY13.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/esws/SchlichtkrullKB18, + author = {Michael Sejr Schlichtkrull and + Thomas N. Kipf and + Peter Bloem and + Rianne van den Berg and + Ivan Titov and + Max Welling}, + editor = {Aldo Gangemi and + Roberto Navigli and + Maria{-}Esther Vidal and + Pascal Hitzler and + Rapha{\"{e}}l Troncy and + Laura Hollink and + Anna Tordai and + Mehwish Alam}, + title = {Modeling Relational Data with Graph Convolutional Networks}, + booktitle = {The Semantic Web - 15th International Conference, {ESWC} 2018, Heraklion, + Crete, Greece, June 3-7, 2018, Proceedings}, + series = {Lecture Notes in Computer Science}, + volume = {10843}, + pages = {593--607}, + publisher = {Springer}, + year = {2018}, + url = {https://doi.org/10.1007/978-3-319-93417-4\_38}, + doi = {10.1007/978-3-319-93417-4\_38}, + timestamp = {Tue, 14 May 2019 10:00:44 +0200}, + biburl = {https://dblp.org/rec/conf/esws/SchlichtkrullKB18.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/esws/GeseseBS19, + author = {Genet Asefa Gesese and + Russa Biswas and + Harald Sack}, + editor = {Mehwish Alam and + Davide Buscaldi and + Michael Cochez and + Francesco Osborne and + Diego Reforgiato Recupero and + Harald Sack}, + title = {A Comprehensive Survey of Knowledge Graph Embeddings with Literals: + Techniques and Applications}, + booktitle = {Proceedings of the Workshop on Deep Learning for Knowledge Graphs + {(DL4KG2019)} Co-located with the 16th Extended Semantic Web Conference + 2019 {(ESWC} 2019), Portoroz, Slovenia, June 2, 2019}, + series = {{CEUR} Workshop Proceedings}, + volume = {2377}, + pages = {31--40}, + publisher = {CEUR-WS.org}, + year = {2019}, + url = {http://ceur-ws.org/Vol-2377/paper\_4.pdf}, + timestamp = {Wed, 12 Feb 2020 16:45:11 +0100}, + biburl = {https://dblp.org/rec/conf/esws/GeseseBS19.bib}, + bibsource = {dblp computer 
science bibliography, https://dblp.org} +} + +@misc{alammar, +title={The Illustrated Transformer}, +url={http://jalammar.github.io/illustrated-transformer/}, +journal={The Illustrated Transformer – Jay Alammar – Visualizing machine learning one concept at a time.}, +author={Jay Alammar}, +year={2018}, +month={Jun}, +note={Accessed: May 2021} +} + +@inproceedings{gehring, + author = {Jonas Gehring and + Michael Auli and + David Grangier and + Denis Yarats and + Yann N. Dauphin}, + editor = {Doina Precup and + Yee Whye Teh}, + title = {Convolutional Sequence to Sequence Learning}, + booktitle = {Proceedings of the 34th International Conference on Machine Learning, + {ICML} 2017, Sydney, NSW, Australia, 6-11 August 2017}, + series = {Proceedings of Machine Learning Research}, + volume = {70}, + pages = {1243--1252}, + publisher = {{PMLR}}, + year = {2017}, + url = {http://proceedings.mlr.press/v70/gehring17a.html}, + timestamp = {Wed, 29 May 2019 08:41:45 +0200}, + biburl = {https://dblp.org/rec/conf/icml/GehringAGYD17.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{alammar-seq2seq, +title={Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention)}, +url={https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/}, journal={Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention) – Jay Alammar – Visualizing machine learning one concept at a time.}, +author={Jay Alammar}, +year={2018}, +month={May}, +note={Accessed: May 2021} +} + +@inproceedings{DBLP:conf/ijcai/Tang0C00L20, + author = {Xiaobin Tang and + Jing Zhang and + Bo Chen and + Yang Yang and + Hong Chen and + Cuiping Li}, + editor = {Christian Bessiere}, + title = {{BERT-INT:} {A} BERT-based Interaction Model For Knowledge Graph Alignment}, + booktitle = {Proceedings of the Twenty-Ninth International Joint Conference on + Artificial Intelligence, {IJCAI} 2020}, + pages = {3174--3180}, + publisher = {ijcai.org}, + year = {2020}, + url = {https://doi.org/10.24963/ijcai.2020/439}, + doi = {10.24963/ijcai.2020/439}, + timestamp = {Mon, 20 Jul 2020 12:38:52 +0200}, + biburl = {https://dblp.org/rec/conf/ijcai/Tang0C00L20.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@article{DBLP:journals/corr/abs-1909-03193, + author = {Liang Yao and + Chengsheng Mao and + Yuan Luo}, + title = {{KG-BERT:} {BERT} for Knowledge Graph Completion}, + journal = {CoRR}, + volume = {abs/1909.03193}, + year = {2019}, + url = {http://arxiv.org/abs/1909.03193}, + archivePrefix = {arXiv}, + eprint = {1909.03193}, + timestamp = {Tue, 17 Sep 2019 11:23:44 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-1909-03193.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/aaai/LiuZ0WJD020, + author = {Weijie Liu and + Peng Zhou and + Zhe Zhao and + Zhiruo Wang and + Qi Ju and + Haotang Deng and + Ping Wang}, + title = {{K-BERT:} Enabling Language Representation with Knowledge Graph}, + booktitle = {The Thirty-Fourth {AAAI} Conference on Artificial Intelligence, {AAAI} + 2020, The Thirty-Second Innovative Applications of Artificial Intelligence + Conference, {IAAI} 2020, The Tenth {AAAI} Symposium on Educational + Advances in Artificial Intelligence, {EAAI} 2020, New York, NY, USA, + February 7-12, 2020}, + pages = {2901--2908}, + publisher = {{AAAI} Press}, + year = {2020}, + url = 
{https://aaai.org/ojs/index.php/AAAI/article/view/5681}, + timestamp = {Tue, 02 Feb 2021 08:00:48 +0100}, + biburl = {https://dblp.org/rec/conf/aaai/LiuZ0WJD020.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{DBLP:conf/aics/VegupattiNC20, + author = {Mani Vegupatti and + Matthias Nickles and + Bharathi Raja Chakravarthi}, + editor = {Luca Longo and + Lucas Rizzo and + Elizabeth Hunter and + Arjun Pakrashi}, + title = {Simple Question Answering Over a Domain-Specific Knowledge Graph using + {BERT} by Transfer Learning}, + booktitle = {Proceedings of The 28th Irish Conference on Artificial Intelligence + and Cognitive Science, Dublin, Republic of Ireland, December 7-8, + 2020}, + series = {{CEUR} Workshop Proceedings}, + volume = {2771}, + pages = {289--300}, + publisher = {CEUR-WS.org}, + year = {2020}, + url = {http://ceur-ws.org/Vol-2771/AICS2020\_paper\_42.pdf}, + timestamp = {Tue, 22 Dec 2020 14:06:50 +0100}, + biburl = {https://dblp.org/rec/conf/aics/VegupattiNC20.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{maison, +title={Python: How To Reduce Memory Consumption By Half By Adding Just One Line Of Code?}, +url={https://www.digitalminds.io/blog/python-how-to-reduce-memory-consumption-by-half-by-adding-just-one-line-of-code}, +journal={digitalminds.io Blog}, +author={Alex Maison}, +year={2019}, +month={Oct} +} + +@inproceedings{pyrdf2vec, + author = {Gilles Vandewiele and Bram Steenwinckel and Terencio Agozzino + and Michael Weyns and Pieter Bonte and Femke Ongenae + and Filip De Turck}, + title = {{pyRDF2Vec: Python Implementation and Extension of RDF2Vec}}, + organization = {IDLab}, + year = {2020}, + url = {https://github.com/IBCNServices/pyRDF2Vec} +} \ No newline at end of file diff --git a/data/embedders/max-depth/bert.csv b/data/embedders/max-depth/bert.csv new file mode 100644 index 0000000..d6469c2 --- /dev/null +++ b/data/embedders/max-depth/bert.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,70.59 +4,74.26 +6,76.32 diff --git a/data/embedders/max-depth/fasttext.csv b/data/embedders/max-depth/fasttext.csv new file mode 100644 index 0000000..1491a16 --- /dev/null +++ b/data/embedders/max-depth/fasttext.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,79.71 +4,77.06 +6,82.35 diff --git a/data/embedders/max-depth/word2vec.csv b/data/embedders/max-depth/word2vec.csv new file mode 100644 index 0000000..d7a4516 --- /dev/null +++ b/data/embedders/max-depth/word2vec.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,76.76 +4,75.00 +6,74.71 diff --git a/data/embedders/max-walks/bert.csv b/data/embedders/max-walks/bert.csv new file mode 100644 index 0000000..e662df9 --- /dev/null +++ b/data/embedders/max-walks/bert.csv @@ -0,0 +1,5 @@ +max_walk,accuracy +100,69.43 +250,73.54 +500,75.24 +1000,76.58 diff --git a/data/embedders/max-walks/fasttext.csv b/data/embedders/max-walks/fasttext.csv new file mode 100644 index 0000000..b5320ac --- /dev/null +++ b/data/embedders/max-walks/fasttext.csv @@ -0,0 +1,5 @@ +max_walk,accuracy +100,77.94 +250,77.35 +500,76.18 +1000,77.35 diff --git a/data/embedders/max-walks/word2vec.csv b/data/embedders/max-walks/word2vec.csv new file mode 100644 index 0000000..7180b6d --- /dev/null +++ b/data/embedders/max-walks/word2vec.csv @@ -0,0 +1,5 @@ +max_walk,accuracy +100,71.47 +250,74.71 +500,73.53 +1000,74.41 diff --git a/data/samplers/max-depth/2/objfreq-inv-split.csv b/data/samplers/max-depth/2/objfreq-inv-split.csv new file mode 100644 index 0000000..95d44f9 --- /dev/null +++ 
b/data/samplers/max-depth/2/objfreq-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.88 +250,75.88 +500,74.71 diff --git a/data/samplers/max-depth/2/objfreq-inv.csv b/data/samplers/max-depth/2/objfreq-inv.csv new file mode 100644 index 0000000..9423ed8 --- /dev/null +++ b/data/samplers/max-depth/2/objfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.53 +250,75.59 +500,72.65 diff --git a/data/samplers/max-depth/2/objfreq.csv b/data/samplers/max-depth/2/objfreq.csv new file mode 100644 index 0000000..d98eaed --- /dev/null +++ b/data/samplers/max-depth/2/objfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,77.94 +250,75.59 +500,75.29 diff --git a/data/samplers/max-depth/2/objpredfreq-inv.csv b/data/samplers/max-depth/2/objpredfreq-inv.csv new file mode 100644 index 0000000..363e3a2 --- /dev/null +++ b/data/samplers/max-depth/2/objpredfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,75.29 +500,73.82 diff --git a/data/samplers/max-depth/2/objpredfreq.csv b/data/samplers/max-depth/2/objpredfreq.csv new file mode 100644 index 0000000..44bb2de --- /dev/null +++ b/data/samplers/max-depth/2/objpredfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,78.82 +250,75.29 +500,73.53 diff --git a/data/samplers/max-depth/2/pagerank-inv-split.csv b/data/samplers/max-depth/2/pagerank-inv-split.csv new file mode 100644 index 0000000..035adae --- /dev/null +++ b/data/samplers/max-depth/2/pagerank-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.18 +250,75.88 +500,73.82 diff --git a/data/samplers/max-depth/2/pagerank-inv.csv b/data/samplers/max-depth/2/pagerank-inv.csv new file mode 100644 index 0000000..30511d2 --- /dev/null +++ b/data/samplers/max-depth/2/pagerank-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,74.41 +250,75.00 +500,74.12 diff --git a/data/samplers/max-depth/2/pagerank-split.csv b/data/samplers/max-depth/2/pagerank-split.csv new file mode 100644 index 0000000..bc41d17 --- /dev/null +++ b/data/samplers/max-depth/2/pagerank-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,74.71 +250,75.88 +500,73.82 diff --git a/data/samplers/max-depth/2/pagerank.csv b/data/samplers/max-depth/2/pagerank.csv new file mode 100644 index 0000000..7451495 --- /dev/null +++ b/data/samplers/max-depth/2/pagerank.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.47 +250,74.12 +500,75.00 diff --git a/data/samplers/max-depth/2/predfreq-inv.csv b/data/samplers/max-depth/2/predfreq-inv.csv new file mode 100644 index 0000000..07231f4 --- /dev/null +++ b/data/samplers/max-depth/2/predfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,77.65 +250,73.82 +500,72.35 diff --git a/data/samplers/max-depth/2/predfreq.csv b/data/samplers/max-depth/2/predfreq.csv new file mode 100644 index 0000000..a7e8220 --- /dev/null +++ b/data/samplers/max-depth/2/predfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,76.18 +500,72.65 diff --git a/data/samplers/max-depth/2/uniform.csv b/data/samplers/max-depth/2/uniform.csv new file mode 100644 index 0000000..ba8e8b7 --- /dev/null +++ b/data/samplers/max-depth/2/uniform.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.18 +250,75.29 +500,73.53 diff --git a/data/samplers/max-depth/2/widesampler.csv b/data/samplers/max-depth/2/widesampler.csv new file mode 100644 index 0000000..93bf285 --- /dev/null +++ b/data/samplers/max-depth/2/widesampler.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,77.35 +500,74.71 diff --git a/data/samplers/max-depth/4/objfreq-inv-split.csv b/data/samplers/max-depth/4/objfreq-inv-split.csv new file mode 100644 index 0000000..7559a6c 
--- /dev/null +++ b/data/samplers/max-depth/4/objfreq-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,72.65 +250,75.59 +500,75.29 diff --git a/data/samplers/max-depth/4/objfreq-inv.csv b/data/samplers/max-depth/4/objfreq-inv.csv new file mode 100644 index 0000000..56bd8e0 --- /dev/null +++ b/data/samplers/max-depth/4/objfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.82 +250,76.18 +500,76.18 diff --git a/data/samplers/max-depth/4/objfreq.csv b/data/samplers/max-depth/4/objfreq.csv new file mode 100644 index 0000000..072309b --- /dev/null +++ b/data/samplers/max-depth/4/objfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,69.12 +250,78.53 +500,74.41 diff --git a/data/samplers/max-depth/4/objpredfreq-inv.csv b/data/samplers/max-depth/4/objpredfreq-inv.csv new file mode 100644 index 0000000..1ee576b --- /dev/null +++ b/data/samplers/max-depth/4/objpredfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.82 +250,70.41 +500,77.94 diff --git a/data/samplers/max-depth/4/objpredfreq.csv b/data/samplers/max-depth/4/objpredfreq.csv new file mode 100644 index 0000000..b469b0d --- /dev/null +++ b/data/samplers/max-depth/4/objpredfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.29 +250,68.82 +500,74.41 diff --git a/data/samplers/max-depth/4/pagerank-inv-split.csv b/data/samplers/max-depth/4/pagerank-inv-split.csv new file mode 100644 index 0000000..7559a6c --- /dev/null +++ b/data/samplers/max-depth/4/pagerank-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,72.65 +250,75.59 +500,75.29 diff --git a/data/samplers/max-depth/4/pagerank-inv.csv b/data/samplers/max-depth/4/pagerank-inv.csv new file mode 100644 index 0000000..3ce42a2 --- /dev/null +++ b/data/samplers/max-depth/4/pagerank-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.00 +250,74.71 +500,75.29 diff --git a/data/samplers/max-depth/4/pagerank-split.csv b/data/samplers/max-depth/4/pagerank-split.csv new file mode 100644 index 0000000..b0607fa --- /dev/null +++ b/data/samplers/max-depth/4/pagerank-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,74.12 +250,71.18 +500,77.35 diff --git a/data/samplers/max-depth/4/pagerank.csv b/data/samplers/max-depth/4/pagerank.csv new file mode 100644 index 0000000..e5fee73 --- /dev/null +++ b/data/samplers/max-depth/4/pagerank.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,72.65 +250,73.53 +500,75.59 diff --git a/data/samplers/max-depth/4/predfreq-inv.csv b/data/samplers/max-depth/4/predfreq-inv.csv new file mode 100644 index 0000000..7bc0f92 --- /dev/null +++ b/data/samplers/max-depth/4/predfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,78.24 +250,74.71 +500,74.12 diff --git a/data/samplers/max-depth/4/predfreq.csv b/data/samplers/max-depth/4/predfreq.csv new file mode 100644 index 0000000..7c0cd85 --- /dev/null +++ b/data/samplers/max-depth/4/predfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.82 +250,74.41 +500,75.59 diff --git a/data/samplers/max-depth/4/uniform.csv b/data/samplers/max-depth/4/uniform.csv new file mode 100644 index 0000000..279433d --- /dev/null +++ b/data/samplers/max-depth/4/uniform.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.24 +250,74.41 +500,77.35 diff --git a/data/samplers/max-depth/4/widesampler.csv b/data/samplers/max-depth/4/widesampler.csv new file mode 100644 index 0000000..c0742f2 --- /dev/null +++ b/data/samplers/max-depth/4/widesampler.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,78.82 +250,75.00 +500,74.41 diff --git a/data/samplers/max-depth/6/objfreq-inv-split.csv b/data/samplers/max-depth/6/objfreq-inv-split.csv new file mode 100644 
index 0000000..0c48821 --- /dev/null +++ b/data/samplers/max-depth/6/objfreq-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,74.12 +250,78.53 +500,75.88 diff --git a/data/samplers/max-depth/6/objfreq-inv.csv b/data/samplers/max-depth/6/objfreq-inv.csv new file mode 100644 index 0000000..f4f2f08 --- /dev/null +++ b/data/samplers/max-depth/6/objfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,76.18 +500,78.82 diff --git a/data/samplers/max-depth/6/objfreq.csv b/data/samplers/max-depth/6/objfreq.csv new file mode 100644 index 0000000..9ec0528 --- /dev/null +++ b/data/samplers/max-depth/6/objfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.59 +250,75 +500,78.24 diff --git a/data/samplers/max-depth/6/objpredfreq-inv.csv b/data/samplers/max-depth/6/objpredfreq-inv.csv new file mode 100644 index 0000000..24fa0e3 --- /dev/null +++ b/data/samplers/max-depth/6/objpredfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.59 +250,78.24 +500,75.59 diff --git a/data/samplers/max-depth/6/objpredfreq.csv b/data/samplers/max-depth/6/objpredfreq.csv new file mode 100644 index 0000000..cdbe67e --- /dev/null +++ b/data/samplers/max-depth/6/objpredfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,70.88 +250,73.24 +500,77.06 diff --git a/data/samplers/max-depth/6/pagerank-inv-split.csv b/data/samplers/max-depth/6/pagerank-inv-split.csv new file mode 100644 index 0000000..e8bb1c6 --- /dev/null +++ b/data/samplers/max-depth/6/pagerank-inv-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.29 +250,76.18 +500,77.94 diff --git a/data/samplers/max-depth/6/pagerank-inv.csv b/data/samplers/max-depth/6/pagerank-inv.csv new file mode 100644 index 0000000..a9f734a --- /dev/null +++ b/data/samplers/max-depth/6/pagerank-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,80 +500,77.35 diff --git a/data/samplers/max-depth/6/pagerank-split.csv b/data/samplers/max-depth/6/pagerank-split.csv new file mode 100644 index 0000000..5abd252 --- /dev/null +++ b/data/samplers/max-depth/6/pagerank-split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,77.06 +250,72.65 +500,76.76 diff --git a/data/samplers/max-depth/6/pagerank.csv b/data/samplers/max-depth/6/pagerank.csv new file mode 100644 index 0000000..679b1d2 --- /dev/null +++ b/data/samplers/max-depth/6/pagerank.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.53 +250,77.35 +500,77.35 diff --git a/data/samplers/max-depth/6/predfreq-inv.csv b/data/samplers/max-depth/6/predfreq-inv.csv new file mode 100644 index 0000000..0072daa --- /dev/null +++ b/data/samplers/max-depth/6/predfreq-inv.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,75.29 +250,78.24 +500,77.35 diff --git a/data/samplers/max-depth/6/predfreq.csv b/data/samplers/max-depth/6/predfreq.csv new file mode 100644 index 0000000..3ad2e0f --- /dev/null +++ b/data/samplers/max-depth/6/predfreq.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,79.41 +250,75.59 +500,79.40 diff --git a/data/samplers/max-depth/6/uniform.csv b/data/samplers/max-depth/6/uniform.csv new file mode 100644 index 0000000..9eaa4f1 --- /dev/null +++ b/data/samplers/max-depth/6/uniform.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.24 +250,75.59 +500,77.94 diff --git a/data/samplers/max-depth/6/widesampler.csv b/data/samplers/max-depth/6/widesampler.csv new file mode 100644 index 0000000..1ce59be --- /dev/null +++ b/data/samplers/max-depth/6/widesampler.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,76.76 +250,75 +500,78.53 diff --git a/data/walkers/max-depth/anonymous.csv b/data/walkers/max-depth/anonymous.csv new file mode 100644 index 
0000000..40d2a68 --- /dev/null +++ b/data/walkers/max-depth/anonymous.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,65.29 +4,66.47 +6,67.65 diff --git a/data/walkers/max-depth/halk.csv b/data/walkers/max-depth/halk.csv new file mode 100644 index 0000000..8c9d561 --- /dev/null +++ b/data/walkers/max-depth/halk.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,75.59 +4,78.82 +6,81.18 diff --git a/data/walkers/max-depth/ngram.csv b/data/walkers/max-depth/ngram.csv new file mode 100644 index 0000000..30308be --- /dev/null +++ b/data/walkers/max-depth/ngram.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,76.47 +4,75.88 +6,77.65 diff --git a/data/walkers/max-depth/random.csv b/data/walkers/max-depth/random.csv new file mode 100644 index 0000000..55ecc98 --- /dev/null +++ b/data/walkers/max-depth/random.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,77.94 +4,76.76 +6,75.29 diff --git a/data/walkers/max-depth/split.csv b/data/walkers/max-depth/split.csv new file mode 100644 index 0000000..e971843 --- /dev/null +++ b/data/walkers/max-depth/split.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,74.71 +4,77.35 +6,79.71 diff --git a/data/walkers/max-depth/walklet.csv b/data/walkers/max-depth/walklet.csv new file mode 100644 index 0000000..26c2b4f --- /dev/null +++ b/data/walkers/max-depth/walklet.csv @@ -0,0 +1,4 @@ +max_depth,accuracy +2,72.06 +4,73.82 +6,71.76 diff --git a/data/walkers/max-walks/anonymous.csv b/data/walkers/max-walks/anonymous.csv new file mode 100644 index 0000000..69c40ec --- /dev/null +++ b/data/walkers/max-walks/anonymous.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,65.59 +250,66.76 +500,65.80 diff --git a/data/walkers/max-walks/halk.csv b/data/walkers/max-walks/halk.csv new file mode 100644 index 0000000..b6a5d71 --- /dev/null +++ b/data/walkers/max-walks/halk.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,77.65 +250,76.76 +500,79.12 diff --git a/data/walkers/max-walks/ngram.csv b/data/walkers/max-walks/ngram.csv new file mode 100644 index 0000000..c9b3bec --- /dev/null +++ b/data/walkers/max-walks/ngram.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,68.82 +250,73.24 +500,71.76 diff --git a/data/walkers/max-walks/random.csv b/data/walkers/max-walks/random.csv new file mode 100644 index 0000000..cca9987 --- /dev/null +++ b/data/walkers/max-walks/random.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,72.35 +250,76.18 +500,72.06 diff --git a/data/walkers/max-walks/split.csv b/data/walkers/max-walks/split.csv new file mode 100644 index 0000000..f35ddeb --- /dev/null +++ b/data/walkers/max-walks/split.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,79.12 +250,77.35 +500,77.35 diff --git a/data/walkers/max-walks/walklet.csv b/data/walkers/max-walks/walklet.csv new file mode 100644 index 0000000..7e39a1e --- /dev/null +++ b/data/walkers/max-walks/walklet.csv @@ -0,0 +1,4 @@ +max_walk,accuracy +100,73.82 +250,71.76 +500,77.06 diff --git a/img/embedders/bert/fine-tuning.pdf b/img/embedders/bert/fine-tuning.pdf new file mode 100644 index 0000000..f308034 Binary files /dev/null and b/img/embedders/bert/fine-tuning.pdf differ diff --git a/img/embedders/bert/input-embeddings.pdf b/img/embedders/bert/input-embeddings.pdf new file mode 100644 index 0000000..458e2a5 Binary files /dev/null and b/img/embedders/bert/input-embeddings.pdf differ diff --git a/img/embedders/bert/pre-training.pdf b/img/embedders/bert/pre-training.pdf new file mode 100644 index 0000000..52e13c2 Binary files /dev/null and b/img/embedders/bert/pre-training.pdf differ diff --git a/img/embedders/w2v/cbow.pdf b/img/embedders/w2v/cbow.pdf new file mode 100644 
index 0000000..3d946c7 Binary files /dev/null and b/img/embedders/w2v/cbow.pdf differ diff --git a/img/embedders/w2v/skip-gram.pdf b/img/embedders/w2v/skip-gram.pdf new file mode 100644 index 0000000..efe3ea0 Binary files /dev/null and b/img/embedders/w2v/skip-gram.pdf differ diff --git a/img/license/license.pdf b/img/license/license.pdf new file mode 100755 index 0000000..4e2d882 Binary files /dev/null and b/img/license/license.pdf differ diff --git a/img/logo/cti.pdf b/img/logo/cti.pdf new file mode 100644 index 0000000..1765af8 Binary files /dev/null and b/img/logo/cti.pdf differ diff --git a/img/logo/heh-technical.pdf b/img/logo/heh-technical.pdf new file mode 100644 index 0000000..f650c6b Binary files /dev/null and b/img/logo/heh-technical.pdf differ diff --git a/img/logo/idlab.pdf b/img/logo/idlab.pdf new file mode 100644 index 0000000..9699cce Binary files /dev/null and b/img/logo/idlab.pdf differ diff --git a/img/logo/imec.pdf b/img/logo/imec.pdf new file mode 100644 index 0000000..aa41edf Binary files /dev/null and b/img/logo/imec.pdf differ diff --git a/img/logo/pole_hainuyer_horizontal.pdf b/img/logo/pole_hainuyer_horizontal.pdf new file mode 100644 index 0000000..8366aaf Binary files /dev/null and b/img/logo/pole_hainuyer_horizontal.pdf differ diff --git a/img/logo/pole_hainuyer_vertical.pdf b/img/logo/pole_hainuyer_vertical.pdf new file mode 100644 index 0000000..2db0c7d Binary files /dev/null and b/img/logo/pole_hainuyer_vertical.pdf differ diff --git a/img/logo/ugent.pdf b/img/logo/ugent.pdf new file mode 100644 index 0000000..faf0e88 Binary files /dev/null and b/img/logo/ugent.pdf differ diff --git a/img/logo/wbe_horizontal.pdf b/img/logo/wbe_horizontal.pdf new file mode 100644 index 0000000..493344a Binary files /dev/null and b/img/logo/wbe_horizontal.pdf differ diff --git a/img/logo/wbe_vertical.pdf b/img/logo/wbe_vertical.pdf new file mode 100644 index 0000000..cb659de Binary files /dev/null and b/img/logo/wbe_vertical.pdf differ diff --git a/img/rdf.png b/img/rdf.png new file mode 100644 index 0000000..ccf3020 Binary files /dev/null and b/img/rdf.png differ diff --git a/img/scikit-learn.png b/img/scikit-learn.png new file mode 100644 index 0000000..a57abc2 Binary files /dev/null and b/img/scikit-learn.png differ diff --git a/img/sparql.png b/img/sparql.png new file mode 100644 index 0000000..331b64a Binary files /dev/null and b/img/sparql.png differ diff --git a/img/transformer/architecture.pdf b/img/transformer/architecture.pdf new file mode 100644 index 0000000..4689276 Binary files /dev/null and b/img/transformer/architecture.pdf differ diff --git a/img/transformer/decoder.pdf b/img/transformer/decoder.pdf new file mode 100644 index 0000000..bc4f44d Binary files /dev/null and b/img/transformer/decoder.pdf differ diff --git a/img/transformer/encoder.pdf b/img/transformer/encoder.pdf new file mode 100644 index 0000000..d5d11ea Binary files /dev/null and b/img/transformer/encoder.pdf differ diff --git a/img/transformer/multi-head-attention.pdf b/img/transformer/multi-head-attention.pdf new file mode 100644 index 0000000..5e68b59 Binary files /dev/null and b/img/transformer/multi-head-attention.pdf differ diff --git a/img/transformer/scaled-dot-product-attention.pdf b/img/transformer/scaled-dot-product-attention.pdf new file mode 100644 index 0000000..d642c06 Binary files /dev/null and b/img/transformer/scaled-dot-product-attention.pdf differ diff --git a/master-thesis.pdf b/master-thesis.pdf new file mode 100644 index 0000000..d6f26cd Binary files /dev/null and 
b/master-thesis.pdf differ diff --git a/master-thesis.tex b/master-thesis.tex new file mode 100755 index 0000000..6a145c6 --- /dev/null +++ b/master-thesis.tex @@ -0,0 +1,153 @@ +\documentclass[a4paper,12pt]{report} +\usepackage[hyphens]{url} +\usepackage[colorlinks=true,allcolors=blueLink]{hyperref} +\usepackage[margin=2cm]{geometry} +\usepackage{abstract} +\usepackage{polyglossia} + +\usepackage[binary-units=true,detect-all,group-separator={,},group-minimum-digits=4]{siunitx} +\usepackage{amsmath,amssymb,amsthm} +\usepackage[round]{natbib} +\usepackage{algorithmic} +\usepackage{algorithm} +\usepackage{bm} +\usepackage{booktabs} +\usepackage{graphicx} +\usepackage{multirow} +\usepackage{adjustbox} +\usepackage{multicol} +\usepackage{pgf} +\usepackage{pgfplots} +\usepackage{tikz} +\usepackage{xcolor} +\usepackage{subcaption} +\usepackage{listings} +\usepackage{threeparttable} +\usepackage{listings} + +\bibliographystyle{plainnat} + +\setmainlanguage{english} +\setotherlanguage{french} + +\theoremstyle{definition} +\newtheorem{definition}{Definition}[section] + +\usetikzlibrary{arrows,automata,calc,decorations.pathreplacing,positioning,shapes,snakes} + +\tikzset{% + arrow/.style={thick,-stealth}, + edge/.style={midway,sloped,above}, + entity/.style={circle,draw}, + label/.style={yshift=0.2cm}, +} + +\definecolor{eclipseComment}{RGB}{63,127,95} +\definecolor{eclipseKeywords}{RGB}{127,0,85} +\definecolor{eclipseStrings}{RGB}{42,0.0,255} + +\definecolor{blueLink}{HTML}{180CAD} +\definecolor{dark}{HTML}{7B7D7B} +\definecolor{grey}{HTML}{C6C7C6} +\definecolor{green}{HTML}{789437} +\definecolor{blue}{HTML}{377894} +\definecolor{red}{HTML}{943778} +\definecolor{mygreen}{HTML}{bdd7d6} +\definecolor{myred}{HTML}{dec3d6} + +\definecolor{myyellow}{HTML}{d6dec3} +\definecolor{mygreen}{HTML}{c3ded9} +\definecolor{myblue}{HTML}{c3d6de} +\definecolor{darkBlue}{HTML}{5f8ba8} +\definecolor{mypurple}{HTML}{c3c9de} +\definecolor{darkPurple}{HTML}{485684} +\definecolor{myyellow}{HTML}{DED8C3} +\definecolor{mybrown}{HTML}{DEC3C9} +\definecolor{darkRed}{HTML}{844871} + +\definecolor{dark}{HTML}{7B7D7B} +\definecolor{eclipseComment}{RGB}{63,127,95} +\definecolor{eclipseKeywords}{RGB}{127,0,85} +\definecolor{eclipseStrings}{RGB}{42,0.0,255} +\definecolor{grey}{HTML}{C6C7C6} + +\definecolor{brown}{HTML}{945337} +\definecolor{medimumBlue}{HTML}{60A8A1} +\definecolor{yellow}{HTML}{938136} + +\definecolor{darkGreen}{HTML}{499336} +\definecolor{lightGreen}{HTML}{779336} +\definecolor{mediumGreen}{HTML}{369352} + +\lstset{ + language=Python, + backgroundcolor=\color{grey!10}, + basicstyle=\fontsize{10}{10}\selectfont\ttfamily, + breaklines=true, + commentstyle=\color{eclipseComment}, + frame=lines, + keywordstyle=\color{eclipseKeywords}\bfseries, + morekeywords={get\_balance,get\_estaking,get\_gtime,get\_lstaking,get\_mcost,get\_pcost,IP1,fmincon,inf,linprog,ones}, + stringstyle=\color{eclipseStrings}, + numbers=left, + numberstyle=\scriptsize, + showstringspaces=false, +} +% Imposes a default size for all schemas using pgfplot. +\pgfplotsset{width=8cm,compat=newest} +\usepgfplotslibrary{units} + +\def\keywords#1{\textit{Keywords: }{#1}} + +% Defines a horizontal line. +\newcommand{\HRule}{\rule{\linewidth}{0.5mm}} +% Automatically increments the number of equations. +\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}} +% Constants in mathematics should be in italic. +\newcommand{\me}{\mathrm{e}} +% Defines source for figures. 
+\newcommand{\source}[1] { + \vspace{7pt}\hspace*{15pt}\hbox{\thinspace{\small{Source: #1}}} +} + +% Defines new column types to have table cells of the same size. +\newcolumntype{C}[1]{>{\centering\let\newline\\\arraybackslash\hspace{0pt}}m{#1}} +\newcolumntype{L}[1]{>{\raggedright\let\newline\\\arraybackslash\hspace{0pt}}m{#1}} +\newcolumntype{R}[1]{>{\raggedleft\let\newline\\\arraybackslash\hspace{0pt}}m{#1}} + +% Aligns content vertically. +\newenvironment{vc}{\topskip0pt\vspace*{\fill}\noindent\ignorespaces}{\strut\vfill} + +% Uses dashes for the itemize environment. +\renewcommand\labelitemi{---} + +\begin{document} +\include{src/flyleaf} +\pagenumbering{arabic} +\newpage +\include{src/abstract} +\newpage +\include{src/acknowledgments} +\tableofcontents +\newpage +\listoffigures +\newpage +\listoftables +\newpage +\include{src/introduction} +\include{src/related-work} +\include{src/objectives} +\include{src/background} +\include{src/embedders} +\include{src/rdf2vec} +\include{src/work-performed} +\include{src/benchmarks} +\include{src/discussion} +\include{src/conclusion} +\include{src/references} +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: t +%%% End: \ No newline at end of file diff --git a/res/internship_master-thesis.pdf b/res/internship_master-thesis.pdf new file mode 100644 index 0000000..2761807 Binary files /dev/null and b/res/internship_master-thesis.pdf differ diff --git a/res/specs.pdf b/res/specs.pdf new file mode 100644 index 0000000..fea34a7 Binary files /dev/null and b/res/specs.pdf differ diff --git a/src/abstract.tex b/src/abstract.tex new file mode 100644 index 0000000..cee5f55 --- /dev/null +++ b/src/abstract.tex @@ -0,0 +1,32 @@ +\begin{abstract} +\noindent Over the past decade, various use cases have highlighted the benefits +of converting a Knowledge Graph into a 2D feature matrix, called embedding +matrix. This conversion can be done with RDF2Vec, an unsupervised task-agnostic +algorithm for numerically representing Knowledge Graph nodes to be used for +downstream Machine Learning tasks. Since 2016, this algorithm has provided good +results using Word2Vec, an embedding technique initially used in the Natural +Language Processing field. However, other techniques in this field have emerged, +such as BERT, which since 2018, is the state-of-the-art algorithm. The goal of +this Master's thesis mainly focused on evaluating BERT for Knowledge Graphs to +determine its impact compared to Word2Vec and FastText. As a result, this +Master's thesis proposed an implementation of BERT and FastText related to +Knowledge Graphs and improving the node embedding's quality generated by +Word2Vec. For the latter, it was suggested to both extract the root nodes' +parents and centralize the position of these roots within their walk +extraction. This Master's thesis also extended the use of RDF2Vec by introducing +\texttt{SplitWalker} and \texttt{WideSampler} as new walking and sampling +strategies. The study done reveals that the results obtained by BERT with +RDF2Vec are not conclusive, contrary to the expectations. The main reason is the +lack of optimization of the BERT's architecture towards Knowledge Graphs, which +explains the creation of BERT variants in this direction. Finally, the model's +accuracy generated by Word2Vec has increased considerably, and both +\texttt{SplitWalker} and \texttt{WideSampler} have proven their effectiveness in +certain use cases. 
+ +\keywords{BERT, Knowledge Graph, Machine Learning, RDF2Vec, Word2Vec} +\end{abstract} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: diff --git a/src/acknowledgments.tex b/src/acknowledgments.tex new file mode 100644 index 0000000..354729b --- /dev/null +++ b/src/acknowledgments.tex @@ -0,0 +1,63 @@ + +\chapter*{Acknowledgements} +\label{chap:acknowledgements} + +First of all, I would like to express my sincere appreciation to my supervisors, +Dr. Ir. G. \textsc{Vandewiele}, Ing. B. \textsc{Steenwinckel}, and Dr. Ir. +S. \textsc{Cremer}. My heartfelt thanks to Dr. Ir. G. \textsc{Vandewiele}, who, +with his experience and knowledge, guided me throughout this Master's +thesis. Dr. Ir. G. \textsc{Vandewiele} was the supervisor that any student +would want to have. He was always there to help and give good advice. I would +also like to thank Ir. B. \textsc{Steenwickel}, who also played an essential +role in this work. Ir. B. \textsc{Steenwickel} also helped me get the job +done. Finally, I would like to thank Dr. Ir. S. \textsc{Cremer} for his advice +in the elaboration of this Master's thesis and for having accepted to supervise +me. + +\medskip + +\noindent I would also like to express my gratitude to +Prof. Dr. F. \textsc{Ongenae}, who gave her approval to realize this Master's +thesis and directly accepted me with welcome arms. + +\medskip + +\noindent I wish to extend my special thanks to Dr. P. \textsc{Colpaert} for +allowing me to get in touch with imec and the IDLab research center. Without his +help, this experience would not have been possible. + +\medskip + +\noindent I want to express my gratitude to the Haute École in Hainaut to +achieve this Master's thesis and to thank every teacher for having passed on +their knowledge to me. Special attention to Ing. L. \textsc{Isidoro}, +MSc. Ir. J-S. \textsc{Lerat}, BSc. Y. \textsc{Pietrzak}, +MSc. L. \textsc{Remacle}, and Ing. G. \textsc{Tricarico} for their kindness, +their passion, and for having made me want to continue these studies. + +\medskip + +\noindent I would also like to thank my friends, whom I have met during these +years of study and who, through their words, their motivation, and the many +memories I have shared with them, have made my life better. In particular, I +would like to thank C. \textsc{Bruyère}, I. \textsc{Delsarte}, +V. \textsc{Denis}, S. \textsc{Eker}, N. \textsc{Luongo}, G. \textsc{Quittet}, +and T. \textsc{Simon}. A particular thought to my childhood friend, +H. \textsc{Koch}, who followed my evolution since the beginning of my teen years +and has always been of great help. + +\medskip + +\noindent Finally, I would like to dedicate this Master's thesis to my family +and, more particularly, to my great-grandfather and my grandmother. Throughout +these years of study, they have always supported me. My life would have taken a +different path without you. I hope that I can one day be as good a person as you +are. + +\thispagestyle{empty} +\clearpage + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: \ No newline at end of file diff --git a/src/background.tex b/src/background.tex new file mode 100644 index 0000000..7e79974 --- /dev/null +++ b/src/background.tex @@ -0,0 +1,32 @@ + +\chapter{Background} +\label{chap:background} + +This chapter ensures that the reader understands the vocabulary used in this +document. 
Specifically, this chapter contains the following five parts:
+
+\begin{multicols}{2}
+  \begin{enumerate}
+  \item \textbf{Graphs}: covers the basic graph types and defines the KG.
+  \item \textbf{ML}: describes the three basic ML paradigms as well as some
+    activation functions.
+  \item \textbf{NLP Techniques}: defines some notions to understand the
+    embedding techniques better.
+    \columnbreak
+  \item \textbf{Attention}: reviews the basics of the attention mechanism
+    helpful in understanding the Transformer architecture.
+  \item \textbf{Transformer}: introduces the Transformer architecture, used
+    by many recent embedding techniques such as BERT.
+  \end{enumerate}
+\end{multicols}
+
+\input{src/background/graphs}
+\input{src/background/ml}
+\input{src/background/nlp}
+\input{src/background/attention}
+\input{src/background/transformers}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../report"
+%%% End: diff --git a/src/background/attention.tex b/src/background/attention.tex new file mode 100644 index 0000000..b126d9e --- /dev/null +++ b/src/background/attention.tex @@ -0,0 +1,304 @@
+
+\section{Attention}
+\label{sec:attention}
+
+\emph{Attention} is a Deep Learning mechanism published in 2014 by
+\textsc{Bahdanau} et al. to solve the bottleneck issue of Recurrent Neural
+Network (RNN) sequential models, which were widely used for neural machine
+translation~\citep{bahdanau}. To fully understand this mechanism, it is helpful
+to start by introducing the functioning of RNNs.
+
+\subsection{Recurrent Neural Networks}
+\label{subsec:rnns}
+
+Before the introduction of this mechanism, RNNs caused accuracy losses when
+processing long input sequences, mainly because of the way RNN encoders generate the
+context vector. In addition, RNNs are difficult to parallelize.
+
+\begin{figure}[!ht]
+  \centering
+  \begin{tikzpicture}[
+    hid/.style 2 args={rectangle split, rectangle split horizontal, draw=#2,
+      rectangle split parts=#1, fill=#2!20, outer sep=1mm},
+    label/.style={font=\small}]
+    \tikzset{>=stealth',every on chain/.append style={join},
+      every join/.style={->}}
+
+    \foreach \step in {1,2,3,4,5,6,7} {
+      \node[hid={3}{blue}] (w\step) at (2.5*\step, -1.5) {};
+      \node[hid={3}{red}] (o\step) at (2.5*\step, 1.5) {};
+      \node[label,rectangle, draw=grey, fill=grey!20,
+      minimum height=0.22cm,minimum width=1.4cm] (h\step) at (2.5*\step, 0) {\scriptsize{RNN}};
+      \draw[->] (w\step.north) -> (h\step.south);
+      \draw[->] (h\step.north) -> (o\step.south);
+    }
+
+    \foreach \step/\next in {1/2,2/3,3/4,4/5,5/6,6/7}
+    \draw[->] (h\step.east) -> (h\next.west) node[above,midway] {$h_\step$};
+
+    \node[label,below=of w1,yshift=30pt] {hello};
+    \node[label,below=of w2,yshift=28pt] {my};
+    \node[label,below=of w3,yshift=30pt] {dear};
+    \node[label,below=of w4,yshift=28pt] {\texttt{<eos>}};
+    \node[label,below=of w5,yshift=30pt] {bonjour};
+    \node[label,below=of w6,yshift=28pt] {ma};
+    \node[label,below=of w7,yshift=30pt] {chère};
+
+    \node[label,above=of h1.north,yshift=13pt] {$o_1$};
+    \node[label,above=of h2.north,yshift=13pt] {$o_2$};
+    \node[label,above=of h3.north,yshift=13pt] {$o_3$};
+    \node[label,above=of h4.north,yshift=13pt] {bonjour};
+    \node[label,above=of h5.north,yshift=15pt] {ma};
+    \node[label,above=of h6.north,yshift=15pt] {chère};
+    \node[label,above=of h7.north,yshift=15pt] {\texttt{<eos>}};
+
+    \draw[loosely dashed, thick] (8.75,3) -- (8.75,1);
+    \draw[loosely dashed, thick] (8.75,-0.5) -- (8.75,-2.5);
+
+    \node[label,above=of h2.north,yshift=35pt] {\textbf{ENCODER}};
+    \node[label,above=of h5.north,yshift=35pt,xshift=30pt] {\textbf{DECODER}};
+  \end{tikzpicture}
+  \caption{Sequence-to-Sequence Learning With RNNs.}
+  \label{fig:rnn}
+\end{figure}
+
+In Figure \ref{fig:rnn}, a Sequence to Sequence (Seq2Seq) model translates an
+English sentence into French using RNN encoders and decoders. Each time step has
+an RNN unit containing an activation function that takes a word embedding and a
+hidden state as input for both encoding and decoding. This hidden state serves
+as a memory to save the entire previous context. Specifically, the hidden state
+at the $t$ time step is computed based on the hidden state at the $t-1$ time
+step and the current word embedding. Once the RNN encoder reaches the first
+End-Of-Sentence token (\texttt{<eos>}), the RNN decoder receives a context vector
+containing the last generated hidden state, namely $h_3$. Finally, each RNN
+decoder unit translates a word based on this context until the \texttt{<eos>}
+token is reached once more.
+
+Although RNNs are effective for short sequences, this is not the case for more
+extensive ones. This mechanism uses a fixed-size context vector and
+generates the encoder's hidden states based on the previous hidden
+state. Consequently, the last words of an input sequence have a greater weight
+than the first ones. Because of this unbalanced weighting, processing a long sequence by
+a model comes at the cost of forgetting the earlier parts of that sequence,
+resulting in a loss of the model's accuracy. RNN variants emerged to reduce this waste
+of information, such as Long Short-Term Memory (LSTM) and Gated Recurrent Units
+(GRU). These variants helped to improve the model's accuracy, but the Attention
+mechanism came up with an interesting idea.
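+
+To make this bottleneck concrete, the following minimal NumPy sketch (purely
+illustrative, with randomly initialized weights instead of the parameters of a
+trained translation model) shows how a vanilla RNN encoder compresses an entire
+input sequence into a single fixed-size context vector:
+\begin{lstlisting}
+import numpy as np
+
+# Toy dimensions: a 4-word sentence, 8-dimensional embeddings and hidden state.
+np.random.seed(0)
+seq_len, emb_dim, hid_dim = 4, 8, 8
+
+# Randomly initialized parameters (a real encoder would learn these).
+W_xh = np.random.randn(emb_dim, hid_dim) * 0.1
+W_hh = np.random.randn(hid_dim, hid_dim) * 0.1
+b_h = np.zeros(hid_dim)
+
+# One embedding per input word, e.g., "hello my dear <eos>".
+embeddings = np.random.randn(seq_len, emb_dim)
+
+h = np.zeros(hid_dim)  # initial hidden state
+for x_t in embeddings:
+    # h_t only depends on the current word embedding and on h_{t-1}.
+    h = np.tanh(x_t @ W_xh + h @ W_hh + b_h)
+
+context = h  # the decoder only ever sees this single fixed-size vector
+print(context.shape)  # (8,)
+\end{lstlisting}
+Whatever the length of the input sentence, the decoder receives only
+\texttt{context}, which is why the earlier words of long sequences tend to be
+forgotten.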
+ +\subsection{Mechanism} +\label{subsec:attention:mechanism} + +The basic idea of the Attention mechanism is not only to pay attention to each +input word in the context vector but also to give a relative importance to each +of them~\citep{bahdanau}. In other words, the Attention mechanism focuses on +matching input and output elements. After its publication, this mechanism became +one of the design choices in many NLP and Computer Vision tasks. Computer Vision +is a field of Artificial Intelligence where the computer learns digital images +or video content. This mechanism has received other +variants~\citep{DBLP:conf/emnlp/LuongPM15}, which increased the model's accuracy +in most of the benchmarks that have been performed. Finally, due to the +popularity of the Attention mechanism, the use of RNNs has been questioned many +times. However, RNNs are still present in everyday life through various voice +assistance applications such as Apple's Siri, Amazon Alexa, and Google Home. + +With the Attention mechanism, the context vector includes each encoder's hidden +state. In addition, each decoder's hidden state processes some additional +calculation to achieve a better model's accuracy compared to the use of RNNs +without Attention~\citep{alammar-seq2seq}. This mechanism mainly solved the +previous issues related to the lack of parallelization and forgetting previous +word contexts for long sequences. + + +\newpage + +Visually, the mechanism works as follows: +\begin{figure}[!ht] + \centering + \begin{tikzpicture}[ + hid/.style 2 args={rectangle split, rectangle split horizontal, draw=#2, + rectangle split parts=#1, fill=#2!20, outer sep=1mm}, + label/.style={font=\small}] + \tikzset{>=stealth',every on chain/.append style={join}, + every join/.style={->}} + + \foreach \step in {1,2,3} { + \node[label,rectangle, draw=grey, fill=grey!20, + minimum height=0.22cm,minimum width=1.35cm] (w\step) at (2.5*\step, -1.5) {\scriptsize{RNN}}; + \node[hid={3}{blue},below=of w\step,yshift=20pt] (e\step) {}; + \draw[->] (e\step) -> (w\step); + } + + + \node[label,rectangle, draw=grey, fill=grey!20, + minimum height=0.22cm,minimum width=1.35cm] (w4) at (12.25,-1.5) {\scriptsize{RNN}}; + \node[label,rectangle, draw=grey, fill=grey!20, + minimum height=0.22cm,minimum width=1.35cm] (w5) at (14.75,-1.5) {\scriptsize{RNN}}; + + \node[right=of w5,xshift=-20pt] (dots) {$\dotso$}; + + \node[hid={3}{blue},below=of w4,yshift=20pt] (e4) {}; + \node[hid={3}{red},below=of w5,yshift=20pt,opacity=0.4] (e5) {}; + \draw[->] (e4) -> (w4); + \draw[->] (e5) -> (w5); + + \foreach \step/\next in {1/2,2/3} + \draw[->] (w\step.east) -> (w\next.west) node[above,midway] (h\step) {$h_\step$}; + + \draw[->] (w3.east) -> ([xshift=30pt]w3.east) node[above,midway] (h3) {$h_3$}; + \draw[->] ([xshift=-50pt]w4.west) -> (w4.west) node[above,midway] (c) {$h_1,h_2,h_3$}; + \draw[->] (w4) -> (w5) node[above,midway] (h4) {$h_4$}; + + \foreach \step in {1,2,3}{ + \node[circle,fill=green!20,draw=green,outer sep=1mm,minimum + size=0.8cm,scale=0.8,above=of h\step] (dot\step) {.}; + \draw[->] (h\step.north) -> (dot\step.south); + } + + \node[circle,fill=red!30,draw=red,outer sep=0.1mm,above=of dot1,minimum size=0.8cm,scale=0.85] (s1) {\scriptsize{$9.7$}}; + \node[circle,fill=red!20,draw=red,outer sep=0.1mm,above=of dot2,minimum size=0.8cm,scale=0.85] (s2) {\scriptsize{$2.1$}}; + \node[circle,fill=red!10,draw=red,outer sep=0.1mm,above=of dot3,minimum size=0.8cm,scale=0.85] (s3) {\scriptsize{$1.2$}}; + + \node[rectangle,draw,above=of s1,minimum 
width=6cm,minimum height=1cm,xshift=72.5pt] (r) {}; + \node[rectangle,draw=mygreen,above=of s1,fill=mygreen,minimum height=0.7cm,yshift=0.9pt] (r1) {}; + \node[rectangle,draw=mygreen,above=of s2,fill=mygreen,minimum height=0.2cm,yshift=0.9pt] (r2) {}; + \node[rectangle,draw=mygreen,above=of s3,fill=mygreen,minimum height=0.1cm,yshift=0.9pt] (r3) {}; + + \node[label,below=of e1,yshift=30pt] {hello}; + \node[label,below=of e2,yshift=28pt] (my) {my}; + \node[label,below=of e3,yshift=30pt] {dear}; + \node[label,below=of e4,yshift=28pt] (eos) {\texttt{}}; + \node[label,below=of e5,yshift=30pt,opacity=0.7] (bonjour) {bonjour}; + + \draw[arrow] ([xshift=-20pt]dot3.west) |- (dot2.east); + \draw[arrow] ([xshift=-20pt]dot2.west) |- (dot1.east); + + \draw ([xshift=-20pt]dot3.west) |- ([xshift=-20pt,yshift=-25pt]dot3.west); + \draw ([xshift=-20pt]dot2.west) |- ([xshift=-20pt,yshift=-30pt]dot2.west); + + \draw[arrow] ([xshift=5pt]h4.north) |- (dot3.east); + \draw (h4) |- ([xshift=-20pt,yshift=-25pt]dot3.west); + \draw ([xshift=-10pt]h4) |- ([xshift=-20pt,yshift=-30pt]dot2.west); + + \foreach \step in {1,2,3} + \draw[arrow] (dot\step) -- ([yshift=-2pt]s\step); + + \foreach \step in {1,2,3} + \draw[arrow] (s\step) -- ([yshift=-2pt]r\step.south); + + \node[circle,above=of r1.south,yshift=10pt,fill=green!20,draw=green,outer sep=1mm,minimum size=0.8cm,scale=0.8] (mul1) {x}; + \node[circle,above=of r2.south,yshift=30pt,fill=green!20,draw=green,outer sep=1mm,minimum size=0.8cm,scale=0.8] (mul2) {x}; + \node[circle,above=of r3.south,yshift=50pt,fill=green!20,draw=green,outer sep=1mm,minimum size=0.8cm,scale=0.8] (mul3) {x}; + + \node[hid={3}{blue},above=of mul1,yshift=28pt] (W1) {}; + \node[hid={3}{blue},above=of mul2,yshift=8pt] (W2) {}; + \node[hid={3}{blue},above=of mul3,yshift=-10pt] (W3) {}; + + \node[label,above=of W1,yshift=-30pt] {hello}; + \node[label,above=of W2,yshift=-32pt] {my}; + \node[label,above=of W3,yshift=-30pt] {dear}; + + \foreach \step in {1,2,3} + \draw[arrow] (W\step) -- (mul\step); + + \draw[arrow] ([xshift=-2.535cm]r.north) -| (mul1.south); + \draw[arrow] ([xshift=-0.05cm]r.north) -| (mul2.south); + \draw[arrow] ([xshift=2.4cm]r.north) -| (mul3.south); + + \node[circle,right=of mul2.east,xshift=70pt,fill=green!20,draw=green,outer + sep=1mm,minimum size=0.8cm,scale=0.8] (sum) {+}; + + \node[hid={3}{red},right=of sum] (output) {}; + \node[label,above=of output,yshift=-30pt] (output_label) {bonjour}; + \draw[arrow] (sum) -- (output); + + \draw[arrow] (mul1.east) -| (sum.south); + \draw[arrow] (mul2.east) -- (sum); + \draw[arrow] (mul3.east) -| (sum.north); + + \node[label,left=of dot1, xshift=0.5cm] {Dot Product}; + \node[label,left=of s1, xshift=0.45cm] {Attention Score}; + \node[label,left=of r1, xshift=0.35cm] {Attention Weights}; + \node[label,left=of mul1,yshift=20pt,xshift=0.55cm] {Weighted Sum}; + + \node[label,below=of my.south,yshift=23pt] {\textbf{ENCODER}}; + \node[label,below=of eos.south,yshift=25pt] {\textbf{DECODER}}; + + \node[right=of dot1,xshift=-30pt,yshift=7pt] {\footnotesize{query}}; + \node[right=of dot2,xshift=-30pt,yshift=7pt] {\footnotesize{query}}; + \node[right=of dot3,xshift=20pt,yshift=7pt] {\footnotesize{query}}; + + \node[above=of h1,xshift=-12pt,yshift=-15pt] {\footnotesize{key}}; + \node[above=of h2,xshift=-12pt,yshift=-15pt] {\footnotesize{key}}; + \node[above=of h3,xshift=-12pt,yshift=-15pt] {\footnotesize{key}}; + + \node[right=of mul1,xshift=-25pt,yshift=7pt] {\footnotesize{value}}; + \node[right=of mul2,xshift=-25pt,yshift=7pt] 
{\footnotesize{value}};
+    \node[right=of mul3,xshift=-25pt,yshift=7pt] {\footnotesize{value}};
+  \end{tikzpicture}
+  \caption{Seq2Seq Learning With Attention Mechanism.}
+  \label{fig:attention}
+\end{figure}
+
+In Figure \ref{fig:attention}, the Attention mechanism starts by using the encoder
+RNNs to generate their hidden states, similar to Figure \ref{fig:rnn}. Once
+these hidden states have been generated, the context including these hidden
+states is provided to each time step of the decoder RNNs. For each RNN decoder
+block, the decoding hidden state is calculated in three significant steps:
+\begin{enumerate}
+\item \textbf{Computation of the Attention/Alignment Score for each word}: to
+  determine which input words deserve the most attention when translating the
+  current word, a dot product is computed between the previous decoder hidden state
+  and every encoder hidden state.
+
+  The original research paper defines \emph{query}, \emph{key}, and \emph{value}
+  by making an analogy with a database. The query allows performing a search
+  (e.g., book) associated with a set of keys (e.g., book title and abstract) for
+  which each key is associated with a value. From then on, the \emph{Dot-Product
+  Attention} consists of computing the weighted matching between $m$ queries and
+  $n$ keys in an $m$ by $n$ matrix.
+
+\item \textbf{Computation of the Attention Weights}: the Attention scores are
+  normalized with a softmax function in such a way as to ease their processing
+  using a probability distribution whose sum is unitary.
+  \begin{definition}[Attention Score]
+    Let $h_1, \ldots, h_N \in \mathbb{R}^h$ be encoder hidden states, $s_t \in
+    \mathbb{R}^h$ be the decoder hidden state, and $t$ be the time
+    step. Mathematically, the $\alpha^t$ Attention weights and the $\mathrm{a}_t$
+    Attention output are defined as follows:
+    \begin{align}
+      \alpha^t &= \mathrm{softmax}\left(\left[ s_t^Th_1, \ldots, s_t^Th_N \right]\right) \in \mathbb{R}^N \\
+      \mathrm{a}_t &= \sum^N_{i=1}\alpha^t_ih_i \in \mathbb{R}^h
+      \label{eq:def:attention:score}
+    \end{align}
+    \label{def:attention:score}
+  \end{definition}
+
+\item \textbf{Computation of the weighted sum}: after normalization, each encoder
+hidden state is multiplied by its weight, and the results are summed to obtain the
+Attention output used to compute the decoder hidden state.
+\end{enumerate}
+
+After the calculation of the hidden states, Attention picks the most likely word
+for translation. This mechanism gives importance to these words by mapping them
+to a higher or lower weight depending on the word’s relevance. Therefore, this
+improves the accuracy of the output predictions~\citep{bahdanau}. However, the
+Attention mechanism in Figure \ref{fig:attention} cannot be used in a neural
+network since there are no weights for a model to train. In practice, the
+Attention mechanism uses three trainable weight matrices, called the key, query,
+and value matrices, by which each key, query, and value is multiplied.
+
+Following the publication of Attention, another attention mechanism emerged
+which applies attention within the input elements themselves. \emph{Self-Attention}, also called
+\emph{intra-attention}, is an Attention mechanism defined in 2016 that relates
+different positions of a single sequence to compute a representation of the same
+sequence~\citep{Cheng}. Unlike the primary Attention mechanism, where the query,
+key, and value matrices may differ, Self-Attention generates these three matrices
+from the same input sequence.
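+
+To make Definition \ref{def:attention:score} concrete, the following minimal
+NumPy sketch (toy values rather than the hidden states of a trained model)
+computes the Attention scores, normalizes them into Attention weights, and
+forms the weighted sum:
+\begin{lstlisting}
+import numpy as np
+
+def softmax(x):
+    e = np.exp(x - np.max(x))  # subtract the maximum for numerical stability
+    return e / e.sum()
+
+np.random.seed(0)
+N, h_dim = 3, 4                # three encoder hidden states of dimension 4
+H = np.random.randn(N, h_dim)  # encoder hidden states h_1, ..., h_N
+s_t = np.random.randn(h_dim)   # current decoder hidden state
+
+scores = H @ s_t               # dot products s_t^T h_i (Attention scores)
+alpha = softmax(scores)        # Attention weights, summing to 1
+a_t = alpha @ H                # weighted sum: the Attention output
+
+print(a_t.shape)               # (4,) -- same dimensionality as the encoder states
+\end{lstlisting}
+In practice, as explained above, the queries, keys, and values are first
+multiplied by trainable weight matrices before these three steps are applied.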
Based on a gradient +signal, each self-attention block can use this signal to propagate information +to the weight matrices, namely the key, query, and values matrices. It is +largely thanks to Self-Attention that the \emph{Transformer} architecture was +created. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/background/graphs.tex b/src/background/graphs.tex new file mode 100644 index 0000000..d104753 --- /dev/null +++ b/src/background/graphs.tex @@ -0,0 +1,147 @@ + +\section{Graphs} +\label{sec:background:graphs} + +Graphs have been used long before the introduction of ML and cover a wide range +of applications. To better understand the precise vocabulary of graphs, this +section provides some definitions. + +\begin{definition}[Graph] + Ordered pair ($V, E$), where $V$ is a finite and non-empty set of elements + called \emph{vertices} (or \emph{nodes}), and $E$ is a set of unordered pairs of + distinct nodes of $V$, called \emph{edges}. + \label{def:graph} +\end{definition} + +\noindent Basic types of graphs include: +\begin{figure}[!ht] + \begin{minipage}{0.45\linewidth} + \centering + \begin{tikzpicture}[minimum size=0.5cm,node distance=1cm] + \node[entity] (alpha) {1}; + \node[entity,above left=of alpha,xshift=15pt] (beta) {2}; + \node[entity,above right=of alpha,yshift=-5pt,xshift=-5pt] (gamma) {3}; + \node[entity,right=of gamma] (delta) {4}; + + \draw[arrow] (alpha) -- (beta); + \draw[arrow] (beta) -- (gamma); + \draw[arrow] (alpha) -- (gamma); + \draw[arrow] (gamma) -- (delta); + \end{tikzpicture} + \end{minipage} + % + \begin{minipage}{0.5\linewidth} + \begin{definition}[Oriented Graph] + Directed Graph where bidirected edges connect no pair of vertices. + \label{def:oriented:graph} + \end{definition} + \end{minipage} + \vfill + \vspace{\belowdisplayskip} + \begin{minipage}{0.45\linewidth} + \centering + \begin{tikzpicture}[minimum size=0.5cm,node distance=1cm] + \node[entity] (alpha) {1}; + \node[entity,above left=of alpha,xshift=15pt] (beta) {2}; + \node[entity,above right=of alpha,yshift=-5pt,xshift=-5pt] (gamma) {3}; + \node[entity,right=of gamma] (delta) {4}; + + \draw[arrow] (alpha) -- (beta); + \draw[arrow] (beta) -- (gamma); + \draw[arrow] (alpha) -- (gamma); + + \path[arrow] let \p1=($(gamma)-(delta)$),\n1={atan2(\y1,\x1)},\n2={180+\n1} in + ($ (delta.\n1)!2pt!90:(gamma.\n2) $) edge node {} ($ (gamma.\n2)!2pt!-90:(delta.\n1) $); + \path[arrow] let \p1=($(delta)-(gamma)$),\n1={atan2(\y1,\x1)},\n2={180+\n1} in + ($ (gamma.\n1)!2pt!90:(delta.\n2) $) edge node {} ($ (delta.\n2)!2pt!-90:(gamma.\n1) + $); + \end{tikzpicture} + \end{minipage} + % + \begin{minipage}{0.5\linewidth} + \begin{definition}[Directed Graph] + Named \emph{digraph}, it is defined as a graph whose edges have an + orientation, also known as \emph{directed edges}, \emph{directed links}, + \emph{arrows}, or \emph{arcs}. 
+    \label{def:directed:graph}
+  \end{definition}
+  \end{minipage}
+  \caption{Basic Graph Types (Part I).}
+\end{figure}
+
+\newpage
+
+\begin{figure}[!ht]
+  \begin{minipage}{0.45\linewidth}
+    \centering
+    \begin{tikzpicture}[minimum size=0.5cm,node distance=1cm]
+      \node[entity] (alpha) {1};
+      \node[entity,above left=of alpha,xshift=15pt] (beta) {2};
+      \node[entity,above right=of alpha,yshift=-5pt,xshift=-5pt] (gamma) {3};
+      \node[entity,right=of gamma] (delta) {4};
+
+      \draw (alpha) -- (beta);
+      \draw (beta) -- (gamma);
+      \draw (alpha) -- (gamma);
+      \draw (gamma) -- (delta);
+    \end{tikzpicture}
+  \end{minipage}
+  %
+  \begin{minipage}{0.5\linewidth}
+    \begin{definition}[Undirected Graph]
+      Graph whose edges are bidirectional.
+      \label{def:undirected:graph}
+    \end{definition}
+  \end{minipage}
+  \vfill
+  \vspace{\belowdisplayskip}
+  \begin{minipage}{0.45\linewidth}
+    \centering
+    \begin{tikzpicture}[minimum size=0.5cm,node distance=1cm]
+      \node[entity] (alpha) {1};
+      \node[entity,above left=of alpha,xshift=15pt] (beta) {2};
+      \node[entity,above right=of alpha,yshift=-5pt,xshift=-5pt] (gamma) {3};
+      \node[entity,right=of gamma] (delta) {4};
+
+      \draw[thick] (alpha) -- (beta);
+      \draw[thick] (beta) -- (gamma);
+      \draw[thick] (alpha) -- (gamma);
+
+      \path[thick] let \p1=($(gamma)-(delta)$),\n1={atan2(\y1,\x1)},\n2={180+\n1} in
+      ($ (delta.\n1)!2pt!90:(gamma.\n2) $) edge node {} ($ (gamma.\n2)!2pt!-90:(delta.\n1) $);
+      \path[thick] let \p1=($(delta)-(gamma)$),\n1={atan2(\y1,\x1)},\n2={180+\n1} in
+      ($ (gamma.\n1)!2pt!90:(delta.\n2) $) edge node {} ($ (delta.\n2)!2pt!-90:(gamma.\n1)
+      $);
+    \end{tikzpicture}
+  \end{minipage}
+  %
+  \begin{minipage}{0.5\linewidth}
+    \begin{definition}[Multigraph]
+      Undirected graph that can store multiple edges between two nodes.
+      \label{def:multigraph}
+    \end{definition}
+  \end{minipage}
+  \caption{Basic Graph Types (Part II).}
+\end{figure}
+
+\noindent Based on these definitions of graph types, the KG is defined.
+\begin{definition}[Knowledge Graph]
+Directed \emph{heterogeneous} multigraph whose node and relation types have
+domain-specific semantics~\citep{website:kamakoti}. These nodes can be of
+different types. From a terminology point of view, the nodes/vertices of a KG
+are often called \emph{entities}, and the directed edges are referred to as
+predicates. Moreover, this multigraph contains \emph{triples}\footnote{Also called
+\emph{triplets}.}, where each triple defines a
+(\texttt{subject}, \texttt{predicate}, \texttt{object}) 3-tuple\footnote{Some
+authors define this tuple as \texttt{(h, r, t)}. In this definition, \texttt{h}
+is the head entity, \texttt{t} the tail entity, and \texttt{r} the relation
+associating the head with the tail entities.}. For ease of processing, the
+multigraph aspect of the KG can be removed by representing each triple as two
+2-tuples: (\texttt{subject} $\rightarrow$ \texttt{predicate}) and
+(\texttt{predicate} $\rightarrow$ \texttt{object}).
+\end{definition}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../../report"
+%%% End: diff --git a/src/background/ml.tex b/src/background/ml.tex new file mode 100644 index 0000000..4f5a7d6 --- /dev/null +++ b/src/background/ml.tex @@ -0,0 +1,264 @@
+
+\section{Machine Learning}
+\label{sec:background:ml}
+
+Machine Learning (ML) is a branch of AI focusing on models that learn
+automatically without being explicitly programmed for the task. ML comes
+with three basic paradigms: supervised, unsupervised, and reinforcement
+learning.
Each of them is defined below. + +\begin{multicols}{2} + \begin{definition}[Supervised Learning] + Type of ML with human supervision, where a model looks for patterns in a data + set with labels~\citep{website:deepai:unsupervised:learning}. + \label{def:supervised:learning} + \end{definition} + + \begin{definition}[Unsupervised Learning] + Type of ML with minimal human supervision, where a model looks for patterns + in a data set without labels. + \label{def:unsupervised:learning} + \end{definition} +\end{multicols} + +Visually, supervised learning and unsupervised learning are defined as follows: +\begin{figure}[!ht] + \centering + \begin{minipage}{.45\textwidth} + \begin{tikzpicture}[baseline,scale=.7] + \begin{axis}[ + height=5.5cm, + width=9cm, + axis x line=center, + axis y line=center, + xlabel style={below right}, + ylabel style={above left}, + xlabel={$X$}, + ylabel={$Y$}, + xtick={4,10}, + xtick={5,14}, + clip mode=individual + ] + + % Trick to display the graph starting at (1,1) + \addplot[green,mark size=0] table [% + x = x, + y = y, + col sep = comma]{ + x,y + 0.5,0.5 + }; + + \addplot[blue, only marks, mark=*, mark size=3] table [% + x = x, + y = y, + col sep = comma]{ + x,y + 3,3 + 3.5,5 + 4,4.2 + 4.5,3.5 + 4.8,2 + 5,5 + 5.5,4 + 6,4.8 + 6.5,3.5 + }; + + \draw [dashed] (4,11) -- (15,3); + + \addplot[red, only marks, mark=*, mark size=3] table [% + x = x, + y = y, + col sep = comma]{ + x, y + 12,12.5 + 12.5,10.5 + 13,9 + 13.5,11 + 14,9.5 + 14.5,8 + 14.7,12 + 14.8,10.7 + 15.5,9.6 + 16,10.8 + }; + \end{axis} + \end{tikzpicture} + \captionof{figure}{Supervised Learning} + \label{tikz:background:supervised:learning} + \end{minipage} + \quad + \begin{minipage}{.45\textwidth} + \begin{tikzpicture}[baseline,scale=.7] + \begin{axis}[ + height=5.5cm, + width=9cm, + axis x line=center, + axis y line=center, + xlabel style={below right}, + ylabel style={above left}, + xlabel={$X$}, + ylabel={$Y$}, + xtick={4,10}, + xtick={5,14}, + clip mode=individual + ] + + % Trick to display the graph starting at (1,1) + \addplot[mark size=0] table [% + x = x, + y = y, + col sep = comma]{ + x,y + 0.5,0.5 + }; + + \addplot[black!70,only marks, mark=*, mark size=3] table [% + x = x, + y = y, + col sep = comma]{ + x,y + 3,3 + 3.5,5 + 4,4.2 + 4.5,3.5 + 4.8,2 + 5,5 + 5.5,4 + 6,4.8 + 6.5,3.5 + }; + + \draw [dashed] (5,4) circle[radius=2.8]; + + \addplot[black!70,only marks, mark=*, mark size=3] table [% + x = x, + y = y, + col sep = comma]{ + x, y + 12,12.5 + 12.5,10.5 + 13,9 + 13.5,11 + 14,9.5 + 14.5,8 + 14.7,12 + 14.8,10.7 + 15.5,9.6 + 16,10.8 + }; + \draw [dashed] (14.5,10) circle[radius=2.8]; + \end{axis} + \end{tikzpicture} + \captionof{figure}{Unsupervised Learning} + \label{tikz:background:unsupervised:learning} + \end{minipage} + \caption{Learning a Model Using Raspberry and Blueberry Data.} + \label{fig:background:supervised:vs:unsupervised} +\end{figure} + +In Figure \ref{tikz:background:supervised:learning}, supervised learning +contains raspberry and blueberry data already classified by their label. A model +learns to predict the values based on a \emph{cost function}, which allows it to +check and correct its predictions according to the actual values. With these +labeled data, the classification and regression problems can use this type of +learning. + +In Figure \ref{tikz:background:unsupervised:learning}, unsupervised learning +still contains raspberry and blueberry data. 
However, this time these data are
+not labeled, which means that a model must find the patterns and structure
+independently. The clustering and association problems use this type of
+learning. In this example, the clustering creates two clusters, with one
+raspberry not belonging to any cluster.
+
+
+\begin{definition}[Reinforcement Learning]
+  Type of ML where a model learns from the experience and feedback of an
+  autonomous agent.
+\end{definition}
+
+\noindent In ML, each neuron inside an Artificial Neural Network (ANN) outputs
+a number between $-\infty$ and $\infty$ that is propagated to its successors via an
+activation function. Once the ANN has completed its processing through these
+neurons, it generates \emph{logits}, a non-normalized vector of predictions
+produced by a classification model. For ease of processing, it is usually
+appropriate to convert these logits into probabilities with a unitary
+sum. According to the required type of classification, activation functions such
+as \emph{softmax} and \emph{sigmoid} are helpful. These functions can help with
+various tasks, such as predicting the most likely word to be the missing word in
+a sentence in the Natural Language Processing (NLP) field.
+
+\begin{definition}[Softmax Function]
+  Also called \emph{softargmax}, this function is used by a logistic regression
+  model for multi-class classification. Specifically, it transforms a vector of
+  $K$ real values into a vector of $K$ elements that
+  range between 0 and 1 with the particularity of having a unitary
+  sum~\citep{website:deepai:softmax}. Let $\mathbf{z}$ be an input vector and $K$
+  be the number of classes in a multi-class classifier. Mathematically, the softmax
+  function of these classes is defined as follows:
+  \begin{equation}
+    \mathrm{\sigma}(\mathbf{z})_i = \frac{\me^{z_i}}{\sum^K_{j=1}\me^{z_j}}
+    \label{eq:softmax}
+  \end{equation}
+  \label{def:softmax}
+\end{definition}
+
+\begin{definition}[Sigmoid Function]
+  Binary classification function recognizable by its ``S'' shaped curve, used by
+  a logistic regression model for transformation purposes. Unlike the softmax
+  function, the sigmoid is applied element-wise: it maps each of the $K$ real values
+  to a value between 0 and 1, without the resulting vector necessarily having
+  a unitary sum. Let $x$ be a real-valued input. Mathematically, the
+  sigmoid function is defined as follows:
+  \begin{equation}
+    \mathrm{\sigma}(x) = \frac{1}{1 + \me^{-x}} = \frac{\me^x}{\me^x + 1}
+    \label{eq:sigmoid}
+  \end{equation}
+  \label{def:sigmoid}
+\end{definition}
+
+\begin{definition}[Rectified Linear Unit]
+  Commonly called \emph{ReLU}, this function is considered one of
+  the most straightforward activation functions. Let $\mathbf{x}$ be an input
+  vector. Mathematically, the ReLU function is defined as follows:
+  \begin{equation}
+    \mathrm{f}(\mathbf{x}) = \max(0, \mathbf{x})
+    \label{eq:relu}
+  \end{equation}
+  where the maximum is applied element-wise.
+  \label{def:relu}
+\end{definition}
+
+\noindent Subsequently, in ML, it is essential to know the similarity of two
+vectors. For example, it is helpful in NLP to tell whether two words share semantic
+similarities. As a result, the use of cosine similarity is relevant.
+
+\begin{definition}[Cosine Similarity]
+  Measures the cosine of the angle between two non-zero vectors of an inner
+  product space~\citep{website:deepai:cosine:similarity}.
Let $\mathbf{u}, + \mathbf{v} \in \mathbb{R}^d$ be two non-zero $d$-dimensional vectors. + Mathematically, the cosine similarity between $\mathbf{u}$ and $\mathbf{v}$ is + defined as follows: + \begin{align} + \cos(\mathbf{u}, \mathbf{v}) = + \frac{\mathbf{u} \cdot \mathbf{v}} + {\left\lVert \mathbf{u} \right\rVert + \left\lVert \mathbf{v} \right\rVert} = + \frac{\sum\limits_{i = 1}^n u_i v_i} + {\sqrt{\sum\limits_{i = 1}^n u_i^2} + \sqrt{\sum\limits_{i = 1}^n v_i^2}} + \label{eq:cosine:similarity} + \end{align} + + where $\cos(\mathbf{u}, \mathbf{v})$ produces a value ranging from -1 to 1. + Specifically, the cosine similarity returns -1 if two vectors do not have any + similarities, 0 if they are unrelated, and 1 if they share every similarity. + \label{def:cosine:similarity} +\end{definition} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/background/nlp.tex b/src/background/nlp.tex new file mode 100644 index 0000000..307dbc7 --- /dev/null +++ b/src/background/nlp.tex @@ -0,0 +1,154 @@ + +\section{Natural Language Processing Techniques} +\label{sec:background:nlp:techniques} + +There are many NLP techniques. The objective is not to cover every one of them +but to define certain notions used by more advanced concepts such as Word2Vec. + +\begin{definition}[Distributed Representation] + Describes the same data features across multiple scalable and interdependent + layers~\citep{website:deepai:distributed:representation}. In a distributed word + representation, there is a distribution of the word information across vector + dimensions. + \label{def:distributed:reprensetation} +\end{definition} + +\begin{definition}[Bag-of-Words] + Vectorizes a text by counting the number of unique words (also called + \emph{tokens}) in a text. Let ``Suikoden is my favorite game, it is a wonderful + game!'' be a text. The representation of this text with Bag-of-Words (BoW) is + defined as follows: + \begin{equation} + \left\{\ \text{Suikoden: } 1,\ \text{is: } 2,\ \text{my: } 1,\ \text{favorite: } 1,\ + \text{game: } 2,\ \text{it: } 1, \ \text{a: } 1,\ \text{wonderful: } 1\ \right\} + \end{equation} + + for which this text can be characterized (e.g., \texttt{[1, 2, 1, 1, 2, 1, 1, + 1]}) by differences measures (e.g., word frequency). + \label{def:bow} +\end{definition} + +\begin{definition}[One-Hot Encoding] + Quantifies categorical data as binary vectors. Specifically, the belonging of + a data point to the $i$th category implies the acquisition of a zero value for + the components of this vector, except for the $i$th component, which receives a + unitary value. Let $K$ be several categories in a data set, and + $\mathbf{y}^{(i)}$ be a data point in the $i$th class. Mathematically, the + following vectorial representation defines such one-hot encoding: + \begin{equation} + \textbf{y}^{(i)} = \underbrace{\begin{bmatrix}0 & \ldots & 0 & + \underbrace{1}_{\textup{index} \;\; i} &0 & \ldots & + 0\end{bmatrix}^\top}_{K \times 1} + \label{eq:one-hot:encoding} + \end{equation} + + where one-hot encoding vectors allow ML algorithms to make better + predictions. Such an encoding does not capture words' semantic and syntactic + information. Therefore, it does not detect semantic and order difference between + the sentences the ``I like cats more than dogs'' and ``I like dogs more than + cats'' As a result, \emph{word embeddings} are privileged to detect these + differences and allow a better numerical representation of words. 
+ \label{def:one-hot:encoding} +\end{definition} + +\begin{definition}[Word Embeddings] + Unsupervised model that captures words' semantic and syntactic information + using an \emph{embedding matrix}, where the embeddings of a \textit{w} word are + a vector $\mathbf{v}_w$. + \label{def:word:embeddings} +\end{definition} + +\begin{definition}[Embedding Matrix] + Randomized matrix of dimensions $\mathcal{W} \times \mathcal{F}$, where + $\mathcal{W}$ is the number of unique words in a document and $\mathcal{F}$ is + the number of features that each unique word in this vocabulary has. The + \emph{gradient descent} uses these matrix values to find the minima of a + function for several \emph{epochs}, the number of complete cycles on a training + data set. From then on, closer words in vector space are assumed to have a + similar meaning. + \label{def:embedding:matrix} +\end{definition} + +\begin{definition}[Window Size] + Determines the context words, also called \emph{training samples}, of a target + word from a sliding window along with a sentence. Therefore, a window size of + two means 2-triple context words, including the related target word and each of + the two words on its left and two on its right. Let ``I will always remember + her'' be a sentence. The following table defines the context words for a window + size of 2: + \begin{table}[!ht] + \centering + \caption{Context Words Determination for a Window Size of 2. } + \label{tab:window:size} + \begin{tabular}{ccc} + \toprule + \textbf{Input Text} & \textbf{Target Word} & \textbf{Context Words} \\ + \midrule + \multirow{2}{*}{\colorbox{blue!25}{I} \colorbox{myblue!25}{will} \colorbox{myblue!25}{always} remember her} & \multirow{2}{*}{i} & will \\ + & & always \\[1.2ex] + \multirow{3}{*}{\colorbox{myblue!25}{I} \colorbox{blue!25}{will} \colorbox{myblue!25}{always} \colorbox{myblue!25}{remember} her} & \multirow{3}{*}{will} & i \\ + & & always \\ + & & remember \\[1.2ex] + \multirow{4}{*}{\colorbox{myblue!25}{I} \colorbox{myblue!25}{will} \colorbox{blue!25}{always} \colorbox{myblue!25}{remember} \colorbox{myblue!25}{her}} & \multirow{4}{*}{always} & i \\ + & & will \\ + & & remember \\ + & & her \\[1.2ex] + \multirow{3}{*}{I \colorbox{myblue!25}{will} \colorbox{myblue!25}{always} \colorbox{blue!25}{remember} \colorbox{myblue!25}{her}} & \multirow{3}{*}{remember} & will \\ + & & always \\ + & & her \\[1.2ex] + \multirow{2}{*}{I will \colorbox{myblue!25}{always} \colorbox{myblue!25}{remember} \colorbox{blue!25}{her}} & \multirow{2}{*}{her} & always \\ + & & remember \\ + \bottomrule + \end{tabular} + \end{table} + + In Table \ref{tab:window:size}, the target word highlighted in blue is + modified at each iteration starting from left to right, considering two words + forward and backward (highlighted in a lighter blue). As such, the context for a + given sentence are known. + \label{def:window:size} +\end{definition} + +\begin{definition}[Stop Words] + Commonly used words (e.g., ``an'', ``the'', and ``is'') having little + value for training a model and are therefore considered noise in a training + data set. + \label{def:stop:words} +\end{definition} + + % \begin{definition}[Mean] Averages of some data. Let $n$ be the data and $\mathbf{x}$ be a + % sample. The mean of these data is defined as: + % \begin{equation} + % \mathrm{\mu} = \frac{1}{n}\sum^n_{i=1}x_i + % \label{eq:mean} + % \end{equation} + % \label{def:mean} + % \end{definition} + + % \begin{definition}[Variance] + % Measures how spread a data point is from an average. 
Let $\mathrm{\mu}$ be the
+  % mean of $n$ data. The variance of a random variable $X$ is defined as:
+  % \begin{equation}
+  %   \mathrm{Var}(X) = E\left[\left(X - \mu\right)^2\right]
+  %   \label{eq:variance}
+  % \end{equation}
+  % \label{def:variance}
+  % \end{definition}
+
+  % \begin{definition}[Standard Deviation]
+  % Measures the dispersion of the data around their center or how they are spread
+  % out in a data set. Let $\mathrm{Var}$ be the variance of $n$ data. The standard
+  % deviation is defined as:
+  % \begin{equation}
+  %   \mathrm{\sigma} = \sqrt{\mathrm{Var}(n)}
+  %   \label{eq:standard:deviation}
+  %   \label{def:standard:deviation}
+  % \end{equation}
+
+  % where unlike variance, standard deviation is measured using the same units as the data.
+  % \end{definition}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../../report"
+%%% End: diff --git a/src/background/transformers.tex b/src/background/transformers.tex new file mode 100644 index 0000000..3855760 --- /dev/null +++ b/src/background/transformers.tex @@ -0,0 +1,207 @@
+
+\section{Transformer}
+\label{sec:transformer}
+
+The \emph{Transformer} architecture is an alternative to RNNs published in 2017,
+which questions the need for RNNs following the introduction of the Attention
+mechanism~\citep{vaswani:attention}. The original paper presents this
+architecture for Machine Translation and introduces two new Attention
+mechanisms: \emph{Scaled Dot-Product Attention} and \emph{Multi-Head Attention}.
+
+\subsection{Scaled Dot-Product Attention}
+\label{subsec:transformer:scaled:dot-prodct:attention}
+
+\emph{Scaled Dot-Product Attention} provides an Attention score based on Dot-Product
+Attention scaled by the inverse square root of the dimension of the
+queries and keys. This scaling prevents the dot products
+from growing large in magnitude when the dimensionality of the queries and keys
+is high; such large values push the softmax function into regions where its
+gradients are extremely small.
+\begin{definition}[Scaled Dot-Product Attention]
+  Let $Q \in \mathbb{R}^{m\times d_k}$ be the matrix of queries and $K \in
+  \mathbb{R}^{n\times d_k}, V \in \mathbb{R}^{n\times d_v}$ be matrices of a set
+  of key-value pairs. Mathematically, the Scaled Dot-Product Attention
+  mechanism is defined as follows:
+  \begin{equation}
+    \mathrm{A}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V
+    \label{eq:def:scaled:dot-product:attention}
+  \end{equation}
+
+  where $d_k$ is the dimensionality of queries as well as keys and $d_v$ the
+  dimensionality of values.
+\end{definition}
+
+\subsection{Multi-Head Attention}
+\label{subsec:transformer:multi-head:attention}
+
+\emph{Multi-Head Attention} (MHA) is an improved version of the Self-Attention
+mechanism in which each word does not bring excessive importance to itself, but
+also pays attention to its interactions with the other words. As a result, MHA achieves
+better results than Self-Attention alone, where the latter leads to a loss of
+efficiency in the Attention embeddings. Finally, MHA still performs a weighted
+average to generate the Attention vector for each token. However, the heads are
+this time computed in parallel.
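+
+Before formalizing MHA, Equation \ref{eq:def:scaled:dot-product:attention} can
+be illustrated with a minimal NumPy sketch (toy, randomly generated matrices
+rather than learned projections of queries, keys, and values):
+\begin{lstlisting}
+import numpy as np
+
+def softmax(x, axis=-1):
+    e = np.exp(x - x.max(axis=axis, keepdims=True))
+    return e / e.sum(axis=axis, keepdims=True)
+
+def scaled_dot_product_attention(Q, K, V):
+    # Q: (m, d_k), K: (n, d_k), V: (n, d_v) -> output: (m, d_v)
+    d_k = Q.shape[-1]
+    scores = Q @ K.T / np.sqrt(d_k)  # scaled (m, n) similarity matrix
+    return softmax(scores, axis=-1) @ V
+
+np.random.seed(0)
+m, n, d_k, d_v = 2, 5, 8, 8
+Q = np.random.randn(m, d_k)
+K = np.random.randn(n, d_k)
+V = np.random.randn(n, d_v)
+print(scaled_dot_product_attention(Q, K, V).shape)  # (2, 8)
+\end{lstlisting}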
+
+\begin{definition}[Multi-Head Attention]
+  Let $f_o: \mathbb{R}^{kd} \to \mathbb{R}^D$ be a linear map, $f_{i,q}, f_{i,
+  k}, f_{i, v}: \mathbb{R}^D \to \mathbb{R}^d$ be three sets of linear maps,
+  $\hat{Q} \in \mathbb{R}^{M\times D}$ be a matrix of queries, and $\hat{K}, \hat{V}
+  \in \mathbb{R}^{N\times D}$ be matrices of keys and values. Mathematically, the
+  MHA is defined as follows:
+  \[
+    \textrm{MHA}(\hat{Q}, \hat{K}, \hat{V}) = f_o \left (
+      \text{concat} \left (
+        \left [
+          A \left (
+            f_{i,q}(\hat{Q}),
+            f_{i,k}(\hat{K}),
+            f_{i,v}(\hat{V})
+          \right )
+          \ \forall \ i \in 1,\dots, k
+        \right ]
+      \right )
+    \right )
+  \]
+  where $\text{MHA}(\hat{Q}, \hat{K}, \hat{V}) \in \mathbb{R}^{M \times D}$ is
+  the output of the MHA, also known as a \emph{neural function} with a $k$-headed
+  attention block.
+\end{definition}
+
+\subsection{Architecture}
+\label{subsec:transformer:architecture}
+
+The Transformer architecture mainly consists of an encoder and a decoder block,
+including the Scaled Dot-Product and MHA mechanisms. Specifically, the encoder
+and decoder are neural components composed of a stack of $N=6$ identical layers.
+
+Visually, one layer of the Transformer works as follows:
+\begin{figure}[!ht]
+  \centering
+  \subfloat[Encoder\label{fig:transformer:encoder}]{%
+    \includegraphics[width=0.23\textwidth]{img/transformer/encoder}
+  }
+  \quad
+  \subfloat[Decoder\label{fig:transformer:decoder}]{%
+    \includegraphics[width=0.23\textwidth]{img/transformer/decoder}
+  }
+  \caption{One Layer of the Architecture of the Transformer.}
+  \label{fig:transformer:architecture:encoder}
+  \source{\textsc{Vaswani} et al. -- Attention Is All You Need.}
+\end{figure}
+
+In Figure \ref{fig:transformer:encoder}, one layer of the encoder contains two
+sub-layers: an MHA mechanism and a position-wise fully connected Feed-Forward
+Neural Network (FFN).
+\begin{definition}[Feed-Forward Neural Network]
+  Let $xW_1 + b_1$ and $xW_2 + b_2$ be two linear transformations. Mathematically,
+  the FFN is defined as follows:
+  \begin{equation}
+    \mathrm{FFN}(x) = \max\left(0, xW_1 + b_1\right)W_2 + b_2
+    \label{eq:def:ffn}
+  \end{equation}
+  where a ReLU activation function is used.
+  \label{def:ffn}
+\end{definition}
+
+Moreover, a residual connection followed by layer normalization wraps
+each of the two sub-layers. Mathematically, the output of every sub-layer is
+defined as follows:
+\begin{equation}
+  \mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))
+  \label{eq:output:sub-layer}
+\end{equation}
+where $\mathrm{Sublayer}(x)$ refers to the function implemented by the sub-layer
+itself. \\
+
+
+\noindent In Figure \ref{fig:transformer:decoder}, the decoder layers
+differ from the encoder layers by two sub-layer changes. The first change uses
+\emph{Masked MHA} as the first sub-layer to prevent positions from
+attending to subsequent positions. The second change adds a new sub-layer,
+called \emph{Encoder-Decoder Attention}, which performs MHA over the Attention
+vectors of the decoder and those of the encoder. The use of this sub-layer makes
+it possible to determine the different relationships between these
+vectors. Finally, the generated embeddings are sent to a position-wise fully
+connected FFN. Like the encoder, each sub-layer is wrapped by a residual
+connection followed by a layer normalization.
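+
+As a complement to Definition \ref{def:ffn} and Equation
+\ref{eq:output:sub-layer}, the position-wise FFN and the residual connection
+followed by layer normalization can be sketched as follows in NumPy (randomly
+initialized toy parameters; the learned gain and bias of layer normalization
+are omitted for brevity):
+\begin{lstlisting}
+import numpy as np
+
+np.random.seed(0)
+d_model, d_ff, seq_len = 8, 32, 5
+
+# Position-wise FFN: FFN(x) = max(0, x W1 + b1) W2 + b2 (ReLU in between).
+W1, b1 = np.random.randn(d_model, d_ff) * 0.1, np.zeros(d_ff)
+W2, b2 = np.random.randn(d_ff, d_model) * 0.1, np.zeros(d_model)
+
+def ffn(x):
+    return np.maximum(0, x @ W1 + b1) @ W2 + b2
+
+def layer_norm(x, eps=1e-6):
+    mean = x.mean(axis=-1, keepdims=True)
+    std = x.std(axis=-1, keepdims=True)
+    return (x - mean) / (std + eps)
+
+x = np.random.randn(seq_len, d_model)  # output of the MHA sub-layer
+out = layer_norm(x + ffn(x))           # LayerNorm(x + Sublayer(x))
+print(out.shape)                       # (5, 8)
+\end{lstlisting}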
+
+From then on, the complete architecture of the Transformer is illustrated as follows:
+\begin{figure}[!ht]
+  \centering
+  \includegraphics[width=0.6\textwidth]{img/transformer/architecture}
+  \caption{Model Architecture of the Transformer.}
+  \source{\textsc{Vaswani} et al. -- Attention Is All You Need}
+  \label{fig:transformer:architecture}
+\end{figure}
+
+\newpage
+
+In Figure \ref{fig:transformer:architecture}, both encoding and decoding take
+the sum of the input/output embeddings with positional embedding as
+input. Unlike RNNs, the Transformer does not have a time step, so that input sequences
+can be injected simultaneously and converted into input embeddings. Each token
+is represented in the embedding space through these input embeddings, where
+semantically similar tokens are closer than others. However, the decoder has the
+particularity of having its input shifted. This shift prevents the model from simply
+learning to copy the decoder input, forcing it to predict the target
+word/character at position $i$ based only on positions 1 to $i-1$.
+
+Finally, the output of the decoding layer is sent to a linear layer. This layer
+is nothing more than an FFN layer whose objective is to extend the dimension of
+this vector to the number of words of the language concerned. Once expanded,
+this vector is submitted to a softmax activation function to transform it into a
+probability distribution and predict the next word according to the word with
+the highest probability. Afterward, this decoding process is iterated several
+times until the end-of-sentence token is generated.
+
+\subsection{Positional Encoding}
+\label{subsec:transformer:positional:encoding}
+
+Considering that the Transformer has no recurrence or convolution mechanism, the
+architecture cannot know the order of an input sequence. As a result, without
+positional information, the Transformer would encode the same meaning for different
+word orders. The easiest way to
+take this context into account is to encode these positions as one-hot features.
+\begin{definition}[Positional Encoding as One-Hot Features]
+  Let $x \in \mathbb{R}^{n \times d}$ be a matrix of sequentially ordered data
+  along the $n$-dimensional axis, and $e_k$ be the $k$th standard basis vector in
+  $\mathbb{R}^n$. Mathematically, the $z \in \mathbb{R}^{n \times d}$ learned
+  combined representation is defined as follows:
+  \begin{equation}
+    \mathrm{z}_k = W^T_z\mathrm{ReLU}\left(W_x^Tx_k + W_e^Te_k\right), W_x \in
+    \mathbb{R}^{\dim(x)\times m}, W_e \in \mathbb{R}^{n\times m}, W_z \in \mathbb{R}^{m\times d}
+    \label{eq:def:positional:encoding:one:hot}
+  \end{equation}
+  \label{def:positional:encoding:one:hot}
+\end{definition}
+
+\noindent Another approach for positional encoding is to build distinct
+representations of inputs and positions~\citep{gehring}. Despite these existing
+approaches to differentiate meanings, the original Transformer paper uses a
+sinusoid-wave-based positional encoding to inject absolute positional
+information of tokens into the sequence.
+
+\begin{definition}[Positional Encoding by \textsc{Vaswani} et al.]
+  Let $d_{model}$ be the embedding dimension of words, and $pos \in \left[0, L -
+  1\right]$ be the position of a word in the input
+  sequence $w = (w_0,\dotsc,w_{L-1})$.
Mathematically, the positional encoding of $w$ is defined as follows: + \begin{align} + \mathrm{PE}(pos,i) = \begin{cases} + \sin\displaystyle\left(\frac{pos}{10000^{2i/d_{model}}}\right)\,, & i = 2k \\\\ + \cos\displaystyle{\left(\frac{pos}{10000^{2i/d_{model}}}\right)}\,, & i = 2k + 1 + \end{cases} \ k \in \mathbb{N} + \label{eq:positional:encoding} + \end{align} + + where the positional encoding follows a specific, learned pattern to identify + word position or the distance between words in the sequence~\citep{alammar}. In + Equation \ref{eq:positional:encoding}, the sinusoidal representation works as + well as a learned representation and better generalizes sequences that are + longer than the training sequences~\citep{vaswani:attention}. +\end{definition} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/benchmarks.tex b/src/benchmarks.tex new file mode 100644 index 0000000..cff8b33 --- /dev/null +++ b/src/benchmarks.tex @@ -0,0 +1,42 @@ + +\chapter{Benchmarks} +\label{chap:benchmarks} + +For these benchmarks, only the maximum number of walks per entity and the +maximum depth per walk is varied. Furthermore, due to a time issue (cf. Section +\ref{sec:objectives:problems}) these benchmarks are performed on \texttt{MUTAG}, +a graph of moderate size composed of +\SI{74567}{triples}\footnote{\textbf{SELECT} (COUNT(*) AS ?triples) +\textbf{WHERE} \{ ?s ?p ?o \}} \SI{22534}{entities }\footnote{\textbf{SELECT} +(COUNT(\textbf{DISTINCT} ?s) AS ?entities) \textbf{WHERE} \{ ?s a \}}, and +\SI{24}{relations}\footnote{\textbf{SELECT} (COUNT(\textbf{DISTINCT} ?p) AS +?relations) \textbf{WHERE} \{ ?s ?p ?o \}}. Finally, each value entered in these +benchmarks is the result of the average of five values. + +\section{Setup} +\label{sec:setup} + +Benchmarks related to embeddings techniques and walking strategies are directly +launched on IDLab's\footnote{Research group of imec.} servers with \SI{4}{CPUs}, +\SI{64}{\giga\byte} RAM, and one GPU. Those related to sampling strategies are +launched directly on a ThinkPad machine with \SI{4}{CPUs} and +\SI{16}{\giga\byte} of RAM. This physical device change is made since, except +for \texttt{UniformSampler}, the sampling strategies only work on locally stored +KGs. As the IDLab servers interact with the KGs via SPARQL endpoints, they were +not used for these benchmarks. Finally, the benchmarks use 340 training entities +and attempt to predict 68 test entities, which is a standard for \texttt{MUTAG}. + +\section{Results} +\label{sec:results} + +This section contains the results of the different embedding techniques, walking +strategies, and sampling strategies for \texttt{MUTAG}. + +\input{src/benchmarks/embedders} +\input{src/benchmarks/walkers} +\input{src/benchmarks/samplers} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../master-thesis" +%%% End: \ No newline at end of file diff --git a/src/benchmarks/embedders.tex b/src/benchmarks/embedders.tex new file mode 100644 index 0000000..c318309 --- /dev/null +++ b/src/benchmarks/embedders.tex @@ -0,0 +1,237 @@ + +\subsection{Embedding Techniques} +\label{subsec:embedding:techniques} + +FastText, BERT and Word2Vec are trained on the basis of ten epochs. In addition, +FastText and Word2Vec use twenty negative words with a vector size of 500. 
For its splitting function, FastText uses a basic function that splits each
+entity on the \texttt{\#} symbol. Finally, each embedding technique uses a
+\texttt{RandomWalker} and a \texttt{UniformSampler}.
+
+Regarding its training, BERT is trained only on \texttt{MUTAG}. In the case of
+multiple KGs, it would have to be re-trained for each different KG. Similarly,
+if BERT is trained with too few walks, it must be retrained with a larger number
+of walks. In such cases, online learning is important to avoid retraining the
+whole model, which can be time-consuming. Unlike BERT, whose training can take
+hours or even days, Word2Vec and FastText train in a few minutes to a few tens
+of minutes, which is a significant difference.
+
+\begin{table}[!ht]
+  \centering
+  \begin{tabular}{rl}
+    \toprule
+    \textbf{Hyperparameter} & \textbf{Value} \\
+    \midrule
+    \textbf{Epochs} & 10 \\
+    \textbf{Warmup Steps} & 500 \\
+    \textbf{Weight Decay} & 0.01 \\
+    \textbf{Learning Rate} & 2e-5 \\
+    \textbf{Batch Size} & 16 \\
+    \bottomrule
+  \end{tabular}
+  \caption{Basic Hyperparameters Used for Training the BERT Model.}
+  \label{tab:bert:hyperparameters}
+\end{table}
+
+The values of the basic hyperparameters in Table \ref{tab:bert:hyperparameters}
+were chosen after several tests using a Grid Search with Cross-Validation,
+taking the training time into account. More precisely, training the BERT model
+with these hyperparameters on \texttt{MUTAG}, with the same hardware
+characteristics as the IDLab servers, takes between 25 minutes and a few hours.
+
+\begin{table}[!ht]
+  \centering
+  \resizebox{\textwidth}{!}{%
+    \begin{tabular}{cccS[table-format=2.2]@{${}\pm{}$}S[table-format=1.1]c}
+      \toprule
+      \textbf{Embedding Technique} & \textbf{Max. Depth} & \textbf{Max. Walks}
+      & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\
+      \midrule
+      \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{2} & \multirow{3}{*}{250} & 79.71 & 2.35 & 1 \\
+      \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 76.76 & 1.71 & 2 \\
+      \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 70.59 & 5.88 & 3 \\
+      \midrule
+      \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{4} & \multirow{3}{*}{250} & 77.06 & 1.50 & 1 \\
+      \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 75.00 & 1.61 & 2 \\
+      \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 74.26 & 2.21 & 3 \\
+      \midrule
+      \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{6} & \multirow{3}{*}{250} & 82.35 & 1.86 & 1 \\
+      \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 76.32 & 3.24 & 2 \\
+      \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 74.71 & 2.35 & 3 \\
+      \bottomrule
+    \end{tabular}
+  }%
+  \caption{Evaluation of the Embedding Techniques for \texttt{MUTAG} According
+    to the Maximum Depth per Walk.}
+  \label{benchmarks:embedders:mutag:depth}
+\end{table}
+
+In Table \ref{benchmarks:embedders:mutag:depth}, regardless of the maximum depth
+per walk chosen for the same number of walks per entity, FastText reaches a
+higher model accuracy than Word2Vec. Specifically, FastText yields an average
+accuracy gain of 4.22 percentage points over Word2Vec. In addition, FastText
+provides an excellent model accuracy on \texttt{MUTAG} for a maximum depth per
+walk of 6. As for BERT, it shows better results for larger maximum depths per
+walk.
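+
+For reproducibility, the training setup summarized in Table
+\ref{tab:bert:hyperparameters} can be expressed with the Hugging Face
+\texttt{transformers} API. The following listing is only a minimal sketch: the
+\texttt{walks} corpus is a hypothetical placeholder for the walks extracted by
+\texttt{pyRDF2Vec}, and a pre-trained WordPiece tokenizer is used as a stand-in
+for the vocabulary built from scratch in this work.
+\begin{verbatim}
+from transformers import (BertConfig, BertForMaskedLM, BertTokenizerFast,
+                          DataCollatorForLanguageModeling, Trainer,
+                          TrainingArguments)
+
+# Hypothetical walk corpus: each walk is a space-separated sequence of hops.
+walks = ["d19 hasBond bond42", "d19 hasAtom atom7"]
+
+# Stand-in tokenizer; the thesis builds its own WordPiece vocabulary instead.
+tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+encodings = tokenizer(walks, truncation=True)
+dataset = [{"input_ids": ids} for ids in encodings["input_ids"]]
+
+# BERT trained from scratch with the MLM objective only (no NSP).
+model = BertForMaskedLM(BertConfig(vocab_size=tokenizer.vocab_size))
+collator = DataCollatorForLanguageModeling(tokenizer, mlm=True,
+                                           mlm_probability=0.15)
+
+args = TrainingArguments(output_dir="bert-mutag",
+                         num_train_epochs=10,             # Epochs
+                         warmup_steps=500,                # Warmup Steps
+                         weight_decay=0.01,               # Weight Decay
+                         learning_rate=2e-5,              # Learning Rate
+                         per_device_train_batch_size=16)  # Batch Size
+
+Trainer(model=model, args=args, data_collator=collator,
+        train_dataset=dataset).train()
+\end{verbatim}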
+ +\begin{figure}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \footnotesize{\texttt{BERT(learning\_rate=2e-5,batch\_size=16)}}, + \footnotesize{\texttt{Word2Vec(negative=20,vector\_size=500)}}, + \footnotesize{\texttt{FastText(negative=20,vector\_size=500)}} + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Depth per Walk}, + xtick={2,4,6}, + ytick={75,77,79.70,82.30}, + ] + + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/embedders/max-depth/bert.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/embedders/max-depth/word2vec.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/embedders/max-depth/fasttext.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Evaluation of the Embedding Techniques for \texttt{MUTAG} According + to the Maximum Depth per Walk.} + \label{fig:benchmarks:embedders:depth} +\end{figure} + +In Figure \ref{fig:benchmarks:embedders:depth}, the curves of Word2Vec and +FastText have an almost identical trajectory, except for a maximum depth per +walk of 6. In this case, the accuracy model of FastText increases, while the +accuracy model of Word2Vec decreases. Finally, BERT's accuracy is proportional +to the maximum depth per walk. As well as the time needed to train the model. + +\begin{table}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{cccS[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c} + \toprule + \textbf{Embedding Technique} & \textbf{Max. Depth} & \textbf{Max. Walks} + & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\ + \midrule + \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{4} & \multirow{3}{*}{100} & 77.94 & 1.61 & 1 \\ + \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 71.47 & 2.20 & 2 \\ + \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 69.43 & 1.73 & 3 \\ + \midrule + \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{4} & \multirow{3}{*}{250} & 77.35 & 3.90 & 1 \\ + \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 74.71 & 2.53 & 2 \\ + \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 73.54 & 2.28 & 3 \\ + \midrule + \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{4} & \multirow{3}{*}{500} & 76.18 & 1.71 & 1 \\ + \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 75.24 & 2.37 & 2 \\ + \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 73.53 & 1.86 & 3 \\ + \midrule + \texttt{FastText(negative=20,vector\_size=500)} & \multirow{3}{*}{4} & \multirow{3}{*}{1000} & 77.35 & 2.73 & 1 \\ + \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & & & 76.58 & 1.17 & 2 \\ + \texttt{Word2Vec(negative=20,vector\_size=500)} & & & 74.41 & 3.03 & 3 \\ + \bottomrule + \end{tabular} + }% + \caption{Evaluation of the Embedding Techniques for \texttt{MUTAG} According + to the Maximum Number of Walks per Entity} + \label{benchmarks:embedders:mutag:walks} + +\end{table} + +In Table \ref{benchmarks:embedders:mutag:walks}, regardless of the number of +walks chosen for the same maximum depth per walk, FastText indicates a model's +accuracy above Word2Vec. 
Specifically, FastText yields an average accuracy gain of 3.675 percentage
+points over Word2Vec. As for BERT, it reaches an interesting model accuracy for
+500 and 1000 walks per entity, but performs less well for a smaller maximum
+number of walks per entity.
+
+\begin{figure}[!ht]
+  \centering
+  \resizebox{\textwidth}{!}{%
+    \begin{tikzpicture}
+      \begin{axis}[
+        scale only axis,
+        grid=major,
+        grid style={dashed,gray!30},
+        height=6cm,
+        width=9cm,
+        legend cell align={left},
+        legend entries={
+          \footnotesize{\texttt{BERT(learning\_rate=2e-5,batch\_size=16)}},
+          \footnotesize{\texttt{Word2Vec(negative=20,vector\_size=500)}},
+          \footnotesize{\texttt{FastText(negative=20,vector\_size=500)}}
+        },
+        legend style={
+          legend pos=outer north east,
+          font=\small
+        },
+        ylabel={Accuracy},
+        xlabel={Maximum Number of Walks per Entity},
+        xtick={100,250,500,1000},
+        ytick={71.50,73.50,74.50,76.20,78}
+        ]
+
+        \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit]
+        table[x=max_walk,y=accuracy,col sep=comma] {data/embedders/max-walks/bert.csv};
+        \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit]
+        table[x=max_walk,y=accuracy,col sep=comma] {data/embedders/max-walks/word2vec.csv};
+        \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit]
+        table[x=max_walk,y=accuracy,col sep=comma] {data/embedders/max-walks/fasttext.csv};
+      \end{axis}
+    \end{tikzpicture}
+  }%
+  \caption{Evaluation of the Embedding Techniques for \texttt{MUTAG} According
+    to the Maximum Number of Walks per Entity.}
+  \label{fig:benchmarks:embedders:walks}
+\end{figure}
+
+In Figure \ref{fig:benchmarks:embedders:walks}, the curves of Word2Vec and
+FastText still follow an almost identical trajectory, except for a maximum
+number of walks per entity of 250. In this case, the model accuracy of Word2Vec
+increases, while the model accuracy of FastText decreases. Finally, BERT's
+accuracy also grows with the maximum number of walks per entity, as does the
+time needed to train the model.
+
+\newpage
+
+\begin{table}[!ht]
+  \centering
+  \begin{tabular}{lc}
+    \toprule
+    \textbf{Embedding Technique} & \textbf{Average Rank} \\
+    \midrule
+    \texttt{FastText(negative=20,vector\_size=500)} & 1 \\
+    \texttt{Word2Vec(negative=20,vector\_size=500)} & 2 \\
+    \texttt{BERT(learning\_rate=2e-5,batch\_size=16)} & 3 \\
+    \bottomrule
+  \end{tabular}
+  \caption{Evaluation of the Average Rank of the Embedding Techniques for \texttt{MUTAG}.}
+  \label{tab:benchmark:embedders:average:rank}
+\end{table}
+
+In Table \ref{tab:benchmark:embedders:average:rank}, FastText is the winning
+embedding technique in these benchmarks for \texttt{MUTAG}, followed by Word2Vec
+and BERT.
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../../master-thesis"
+%%% End:
diff --git a/src/benchmarks/samplers.tex b/src/benchmarks/samplers.tex
new file mode 100644
index 0000000..2de3fae
--- /dev/null
+++ b/src/benchmarks/samplers.tex
@@ -0,0 +1,470 @@
+
+\subsection{Sampling Strategies}
+\label{subsec:samplers}
+
+To determine the accuracy impact of \texttt{WideSampler}, the latter is first
+compared to the other sampling strategies using \texttt{MUTAG}, where only the
+maximum number of walks and the maximum depth per walk are varied. The number of
+standard entities for \texttt{MUTAG} being fixed at 340 training entities, 68 of
+these serve as testing entities.
As RDF2Vec is unsupervised, including the testing entities in the +training set is not an issue. + +\newpage + +\begin{table}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{lcc S[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c} + \toprule + \textbf{Sampler} & \textbf{Max. Depth} & \textbf{Max. Walks} + & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\ + \midrule + \texttt{ObjPredFreqSampler} & \multirow{13}{*}{2} & \multirow{13}{*}{100} & \textbf{78.82} & 3.17 & 1 \\ + \texttt{ObjFreqSampler} & & & 77.94 & 3.35 & 2 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 77.65 & 2.35 & 3 \\ + \texttt{WideSampler} & & & 76.76 & 1.95 & 4 \\ + \texttt{ObjPredFreqSampler(inverse=True)} & & & 76.76 & 2.16 & 5 \\ + \texttt{PredFreqSampler} & & & 76.76 & 4.87 & 6 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 76.18 & 2.16 & 7 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 76.47 & 2.08 & 8 \\ + \texttt{UniformSampler} & & & 76.18 & 4.50 & 9 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 75.88 & 4.01 & 10 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 74.71 & 1.71 & 11 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 74.41 & 1.50 & 12 \\ + \texttt{ObjFreqSampler(inverse=True)} & & & 73.53 & 2.79 & 13 \\ + \midrule + \texttt{WideSampler} & \multirow{13}{*}{2} & \multirow{13}{*}{250} & \textbf{77.35} & 1.99 & 1 \\ + \texttt{PredFreqSampler} & & & 76.18 & 3.14 & 2 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 75.88 & 1.50 & 3 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 75.88 & 3.03 & 4 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 75.88 & 5.06 & 5 \\ + \texttt{ObjFreqSampler(inverse=true)} & & & 75.59 & 1.99 & 6 \\ + \texttt{ObjFreqSampler} & & & 75.59 & 4.71 & 7 \\ + \texttt{ObjPredFreqSampler(inverse=True)} & & & 75.29 & 1.10 & 8 \\ + \texttt{UniformSampler} & & & 75.29 & 1.44 & 9 \\ + \texttt{ObjPredFreqSampler} & & & 75.29 & 2.35 & 10 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 75.00 & 2.46 & 11 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 74.12 & 1.99 & 12 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 73.82 & 2.53 & 13 \\ + \midrule + \texttt{ObjFreqSampler} & \multirow{13}{*}{2} & \multirow{13}{*}{500} & \textbf{75.29} & 2.16 & 1 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 75.00 & 0.00 & 2 \\ + \texttt{WideSampler} & & & 74.71 & 1.10 & 3 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 74.71 & 3.14 & 4 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 74.12 & 2.56 & 5 \\ + \texttt{ObjPredFreqSampler(inverse=True)} & & & 73.82 & 2.53 & 6 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 73.82 & 2.16 & 7 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 73.82 & 2.53 & 8 \\ + \texttt{ObjPredFreqSampler} & & & 73.53 & 1.32 & 9 \\ + \texttt{UniformSampler} & & & 73.53 & 2.63 & 10 \\ + \texttt{PredFreqSampler} & & & 72.65 & 2.73 & 11 \\ + \texttt{ObjFreqSampler(inverse=True)} & & & 72.65 & 2.73 & 12 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 72.35 & 1.95 & 13 \\ + \bottomrule + \end{tabular} + }% + \caption{Accuracy of Sampling Strategies for \texttt{MUTAG} (Part I).} + \label{tab:benchmarks:samplers:part:1} +\end{table} + +In Table \ref{tab:benchmarks:samplers:part:1}, for a maximum depth of walk of 2 with +a maximum number of walks of 250, \texttt{WideSampler} indicates the best +model's accuracy. 
In addition, the latter gives good precision models for a +maximum number of walks of 100 and 500. + + \begin{figure}[!ht] + \centering + \resizebox{0.9\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \texttt{ObjFreqSampler(inverse=True,split=True)}, + \texttt{ObjFreqSampler(inverse=True)}, + \texttt{ObjFreqSampler}, + \texttt{ObjPredFreqSampler(inverse=True)}, + \texttt{ObjPredFreqSampler}, + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)}, + \texttt{PageRankSampler(inverse=True,alpha=0.85)}, + \texttt{PageRankSampler(split=True,alpha=0.85)}, + \texttt{PageRankSampler(alpha=0.85)}, + \texttt{PredFreqSampler(inverse=True)}, + \texttt{PredFreqSampler}, + \texttt{UniformSampler}, + \texttt{WideSampler}, + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Number of Walks per Entity}, + xtick={100,250,500}, + ytick={73,74,76,78}, + ] + + \addplot[medimumBlue,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/objfreq-inv-split.csv}; + \addplot[darkBlue,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/objfreq-inv.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/objfreq.csv}; + + \addplot[darkRed,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/objpredfreq-inv.csv}; + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/objpredfreq.csv}; + + \addplot[mediumGreen,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/pagerank-inv-split.csv}; + \addplot[darkGreen,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/pagerank-inv.csv}; + \addplot[yellow,mark=triangle,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma]{data/samplers/max-depth/2/pagerank-split.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/pagerank.csv}; + + \addplot[darkPurple,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/predfreq-inv.csv}; + \addplot[purple,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/predfreq.csv}; + + \addplot[brown,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/uniform.csv}; + \addplot[black,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/2/widesampler.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Sampling Strategies for \texttt{MUTAG} According + to a Maximum Depth per Walk of 2.} + \label{fig:benchmarks:samplers:part:1} +\end{figure} + +In Figure \ref{fig:benchmarks:samplers:part:1}, the model's accuracy for the +different walking strategies is inversely proportional to the maximum number of +walks per entity. 
Moreover, \texttt{WideSampler} and \texttt{PredFreqSampler} +have two almost similar curves by their trajectory. + +\begin{table}[!ht] + \centering + \resizebox{0.94\textwidth}{!}{% + \begin{tabular}{lccS[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c} + \toprule + \textbf{Sampler} & \textbf{Max. Depth} & \textbf{Max. Walks} + & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\ + \midrule + \texttt{WideSampler} & \multirow{13}{*}{4} & \multirow{13}{*}{100} & \textbf{78.82} & 3.03 & 1 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 78.24 & 2.53 & 2 \\ + \texttt{ObjPredFreqSampler} & & & 75.29 & 3.99 & 3 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 75.00 & 2.63 & 4 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 74.12 & 3.79 & 5 \\ + \texttt{ObjFreqSampler(inverse=True)} & & & 73.82 & 2.16 & 6 \\ + \texttt{PredFreqSampler} & & & 73.82 & 2.53 & 7 \\ + \texttt{UniformSampler} & & & 73.24 & 1.95 & 8 \\ + \texttt{ObjPredFreqSampler(inverse=True)} & & & 72.94 & 1.50 & 9 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 72.65 & 1.18 & 10 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 72.65 & 3.79 & 11 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 72.65 & 4.22 & 12 \\ + \texttt{ObjFreqSampler} & & & 69.12 & 5.73 & 13 \\ + \midrule + \texttt{ObjPredFreqSampler(inverse=True)} & \multirow{13}{*}{4} & \multirow{13}{*}{250} & \textbf{79.41} & 2.63 & 1 \\ + \texttt{ObjFreqSampler} & & & 78.53 & 2.56 & 2 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 77.06 & 2.56 & 3 \\ + \texttt{ObjFreqSampler(inverse=True)} & & & 76.18 & 3.14 & 4 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 75.59 & 2.88 & 5 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 75.59 & 1.76 & 6 \\ + \texttt{WideSampler} & & & 75.00 & 2.94 & 7 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 74.71 & 2.85 & 8 \\ + \texttt{PredFreqSampler} & & & 74.41 & 2.73 & 9 \\ + \texttt{UniformSampler} & & & 74.41 & 2.56 & 10 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 73.53 & 2.28 & 11 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 71.18 & 2.39 & 12 \\ + \texttt{ObjPredFreqSampler} & & & 68.82 & 3.53 & 13 \\ + \midrule + \texttt{ObjPredFreqSampler(inverse=True)} & \multirow{13}{*}{4} & \multirow{13}{*}{500} & \textbf{77.94} & 0.93 & 1 \\ + \texttt{ObjFreqSampler(inverse=True)} & & & 77.65 & 1.95 & 2 \\ + \texttt{UniformSampler} & & & 77.35 & 1.18 & 3 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 77.35 & 2.73 & 4 \\ + \texttt{ObjPredFreqSampler} & & & 77.06 & 2.39 & 5 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 76.18 & 1.10 & 6 \\ + \texttt{PredFreqSampler} & & & 75.59 & 4.32 & 7 \\ + \texttt{PageRankSampler(alpha=0.85)} & & & 75.59 & 3.03 & 8 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 75.29 & 2.16 & 9 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 75.29 & 3.14 & 10 \\ + \texttt{WideSampler} & & & 74.41 & 2.20 & 11 \\ + \texttt{ObjFreqSampler} & & & 74.41 & 2.20 & 11 \\ + \texttt{PredFreqSampler(inverse=True)} & & & 74.12 & 4.12 & 12 \\ + \bottomrule + \end{tabular} + }% + \caption{Accuracy of Sampling Strategies for \texttt{MUTAG} (Part II).} + \label{tab:benchmarks:samplers:part:2} +\end{table} + +In Table \ref{tab:benchmarks:samplers:part:2}, for a maximum depth per walk of 4 +with a maximum number of walks of 100, \texttt{WideSampler} indicates the best +model's accuracy. 
However, the model's accuracy of \texttt{WideSampler} is +inversely proportional to the maximum number of walks per entity. + +\newpage + + \begin{figure}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \texttt{ObjFreqSampler(inverse=True,split=True)}, + \texttt{ObjFreqSampler(inverse=True)}, + \texttt{ObjFreqSampler}, + \texttt{ObjPredFreqSampler(inverse=True)}, + \texttt{ObjPredFreqSampler}, + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)}, + \texttt{PageRankSampler(inverse=True,alpha=0.85)}, + \texttt{PageRankSampler(split=True,alpha=0.85)}, + \texttt{PageRankSampler(alpha=0.85)}, + \texttt{PredFreqSampler(inverse=True)}, + \texttt{PredFreqSampler}, + \texttt{UniformSampler}, + \texttt{WideSampler}, + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Number of Walks per Entity}, + xtick={100,250,500}, + ytick={70,72,74,76,78} + ] + + \addplot[medimumBlue,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/objfreq-inv-split.csv}; + \addplot[darkBlue,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/objfreq-inv.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/objfreq.csv}; + + \addplot[darkRed,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/objpredfreq-inv.csv}; + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/objpredfreq.csv}; + + \addplot[mediumGreen,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/pagerank-inv-split.csv}; + \addplot[darkGreen,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/pagerank-inv.csv}; + \addplot[yellow,mark=triangle,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma]{data/samplers/max-depth/4/pagerank-split.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/pagerank.csv}; + + \addplot[darkPurple,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/predfreq-inv.csv}; + \addplot[purple,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/predfreq.csv}; + + \addplot[brown,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/uniform.csv}; + \addplot[black,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/4/widesampler.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Sampling Strategies for \texttt{MUTAG} According + to a Maximum Depth per Walk of 4.} + \label{fig:benchmarks:samplers:part:2} +\end{figure} + +In Figure \ref{fig:benchmarks:samplers:part:2}, the model's accuracy for most of +the walking strategies is proportional to the maximum number of walks per +entity. 
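+
+For reference, switching from one sampling strategy to another in these
+benchmarks only changes the sampler object passed to the walker. The following
+listing is a minimal sketch assuming \texttt{pyRDF2Vec}-style constructors (as
+listed in the tables above); the KG location and the entity list are
+hypothetical placeholders.
+\begin{verbatim}
+from pyrdf2vec import RDF2VecTransformer
+from pyrdf2vec.embedders import Word2Vec
+from pyrdf2vec.graphs import KG
+from pyrdf2vec.samplers import (ObjFreqSampler, PageRankSampler,
+                                UniformSampler, WideSampler)
+from pyrdf2vec.walkers import RandomWalker
+
+kg = KG("mutag.owl")      # placeholder: local copy of MUTAG
+entities = [...]          # placeholder: the 340 training entities
+
+for sampler in (WideSampler(), ObjFreqSampler(),
+                PageRankSampler(alpha=0.85), UniformSampler()):
+    transformer = RDF2VecTransformer(
+        Word2Vec(negative=20, vector_size=500),
+        # max. depth 4, max. 250 walks per entity, chosen sampler
+        walkers=[RandomWalker(4, 250, sampler)],
+    )
+    # Embeddings of the entities, later fed to the downstream classifier.
+    embeddings, _ = transformer.fit_transform(kg, entities)
+\end{verbatim}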
+
+\begin{table}[!ht]
+  \centering
+  \resizebox{0.85\textwidth}{!}{%
+    \begin{tabular}{lcc S[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c}
+      \toprule
+      \textbf{Sampler} & \textbf{Max. Depth} & \textbf{Max. Walks}
+      & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\
+      \midrule
+      \texttt{PredFreqSampler} & \multirow{13}{*}{6} & \multirow{13}{*}{100} & \textbf{79.41} & 2.46 & 1 \\
+      \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 77.06 & 3.03 & 2 \\
+      \texttt{ObjFreqSampler(inverse=True)} & & & 76.76 & 3.40 & 3 \\
+      \texttt{WideSampler} & & & 76.76 & 3.40 & 3 \\
+      \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 76.76 & 3.88 & 4 \\
+      \texttt{ObjFreqSampler} & & & 75.59 & 5.06 & 5 \\
+      \texttt{ObjPredFreqSampler(inverse=True)} & & & 75.59 & 2.73 & 6 \\
+      \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 75.29 & 1.71 & 7 \\
+      \texttt{PredFreqSampler(inverse=True)} & & & 75.29 & 1.71 & 8 \\
+      \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 74.12 & 3.03 & 9 \\
+      \texttt{PageRankSampler(alpha=0.85)} & & & 73.53 & 3.35 & 10 \\
+      \texttt{UniformSampler} & & & 73.24 & 3.77 & 11 \\
+      \texttt{ObjPredFreqSampler} & & & 70.88 & 6.47 & 12 \\
+      \midrule
+      \texttt{PageRankSampler(inverse=True,alpha=0.85)} & \multirow{13}{*}{6} & \multirow{13}{*}{250} & \textbf{80.00} & 1.50 & 1 \\
+      \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 78.53 & 3.17 & 2 \\
+      \texttt{PredFreqSampler(inverse=True)} & & & 78.24 & 4.30 & 3 \\
+      \texttt{ObjPredFreqSampler(inverse=True)} & & & 78.24 & 6.20 & 4 \\
+      \texttt{PageRankSampler(alpha=0.85)} & & & 77.35 & 2.39 & 5 \\
+      \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 76.18 & 2.53 & 6 \\
+      \texttt{ObjFreqSampler(inverse=True)} & & & 76.18 & 4.30 & 7 \\
+      \texttt{UniformSampler} & & & 75.59 & 1.76 & 8 \\
+      \texttt{PredFreqSampler} & & & 75.59 & 3.03 & 9 \\
+      \texttt{ObjFreqSampler} & & & 75.00 & 2.08 & 10 \\
+      \texttt{WideSampler} & & & 75.00 & 4.83 & 11 \\
+      \texttt{ObjPredFreqSampler} & & & 73.24 & 4.78 & 12 \\
+      \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 72.65 & 2.56 & 13 \\
+      \midrule
+      \texttt{PredFreqSampler} & \multirow{13}{*}{6} & \multirow{13}{*}{500} & \textbf{79.41} & 3.08 & 1 \\
+      \texttt{ObjFreqSampler(inverse=True)} & & & 78.82 & 1.50 & 2 \\
+      \texttt{WideSampler} & & & 78.53 & 1.18 & 3 \\
+      \texttt{ObjFreqSampler} & & & 78.24 & 1.95 & 4 \\
+      \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & & & 77.94 & 1.32 & 5 \\
+      \texttt{UniformSampler} & & & 77.94 & 1.86 & 6 \\
+      \texttt{PredFreqSampler(inverse=True)} & & & 77.35 & 1.50 & 7 \\
+      \texttt{PageRankSampler(alpha=0.85)} & & & 77.35 & 1.50 & 7 \\
+      \texttt{PageRankSampler(inverse=True,alpha=0.85)} & & & 77.35 & 1.99 & 8 \\
+      \texttt{ObjPredFreqSampler} & & & 77.06 & 1.99 & 9 \\
+      \texttt{PageRankSampler(split=True,alpha=0.85)} & & & 76.76 & 2.16 & 10 \\
+      \texttt{ObjFreqSampler(inverse=True,split=True)} & & & 75.88 & 1.76 & 11 \\
+      \texttt{ObjPredFreqSampler(inverse=True)} & & & 75.59 & 3.55 & 12 \\
+      \bottomrule
+    \end{tabular}
+  }%
+  \caption{Accuracy of Sampling Strategies for \texttt{MUTAG} (Part III).}
+  \label{tab:benchmark:samplers:walks:part:3}
+\end{table}
+
+In Table \ref{tab:benchmark:samplers:walks:part:3}, for a maximum depth per walk
+of 6 with a maximum number of walks of 500, \texttt{WideSampler} indicates a
+good model's accuracy. However, the latter indicates a poor model's accuracy for
+a maximum number of walks of 250.
+ +\newpage + +\begin{figure}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \texttt{ObjFreqSampler(inverse=True,split=True)}, + \texttt{ObjFreqSampler(inverse=True)}, + \texttt{ObjFreqSampler}, + \texttt{ObjPredFreqSampler(inverse=True)}, + \texttt{ObjPredFreqSampler}, + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)}, + \texttt{PageRankSampler(inverse=True,alpha=0.85)}, + \texttt{PageRankSampler(split=True,alpha=0.85)}, + \texttt{PageRankSampler(alpha=0.85)}, + \texttt{PredFreqSampler(inverse=True)}, + \texttt{PredFreqSampler}, + \texttt{UniformSampler}, + \texttt{WideSampler}, + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Number of Walks per Entity}, + xtick={100,250,500}, + ytick={72,74,76,78,80} + ] + + \addplot[medimumBlue,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/objfreq-inv-split.csv}; + \addplot[darkBlue,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/objfreq-inv.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/objfreq.csv}; + + \addplot[darkRed,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/objpredfreq-inv.csv}; + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/objpredfreq.csv}; + + \addplot[mediumGreen,mark=diamond,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/pagerank-inv-split.csv}; + \addplot[darkGreen,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/pagerank-inv.csv}; + \addplot[yellow,mark=triangle,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma]{data/samplers/max-depth/6/pagerank-split.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/pagerank.csv}; + + \addplot[darkPurple,mark=square,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/predfreq-inv.csv}; + \addplot[purple,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/predfreq.csv}; + + \addplot[brown,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/uniform.csv}; + \addplot[black,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/samplers/max-depth/6/widesampler.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Sampling Strategies for \texttt{MUTAG} According + to a Maximum Depth per Walk of 6.} + \label{fig:benchmarks:samplers:part:3} +\end{figure} + +In Figure \ref{fig:benchmarks:samplers:part:3}, the curve of +\texttt{WideSampler} shows the same trajectory as \texttt{PredFreqSampler} and +\texttt{PageRankSampler(inverse=True,alpha=0.85)}. In addition, +\texttt{ObjPreqFreqSampler} is proportional to the maximum number of walks per +entity. 
+ +\begin{table}[!ht] + \centering + \begin{tabular}{lc} + \toprule + \textbf{Sampler} & \textbf{Average Rank} \\ + \midrule + \texttt{WideSampler} & 1 \\ + \texttt{ObjPredFreqSampler(inverse=True)} & 2 \\ + \texttt{PredFreqSampler} & 3 \\ + \texttt{ObjFreqSampler} & 4 \\ + \texttt{ObjFreqSampler(inverse=True)} & 5 \\ + \texttt{PageRankSampler(inverse=True,alpha=0.85)} & 6 \\ + \texttt{PageRankSampler(inverse=True,split=True,alpha=0.85)} & 7 \\ + \texttt{ObjFreqSampler(inverse=True,split=True)} & 8 \\ + \texttt{PageRankSampler(split=True,alpha=0.85)} & 9 \\ + \texttt{PredFreqSampler(inverse=True)} & 10 \\ + \texttt{ObjPredFreqSampler} & 11 \\ + \texttt{UniformSampler} & 11 \\ + \texttt{PageRankSampler(alpha=0.85)} & 12 \\ + \bottomrule + \end{tabular} + \caption{Average Rank of the Sampling Strategies.} + \label{benchmark:samplers:average:rank} +\end{table} + +In Figure \ref{benchmark:samplers:average:rank}, \texttt{WideSampler} is the winning +sampler strategy in these benchmarks for \texttt{MUTAG}, followed by +\texttt{ObjPredFreqSampler(inverse=True)}, and \texttt{PredFreqSampler}. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../master-thesis" +%%% End: diff --git a/src/benchmarks/walkers.tex b/src/benchmarks/walkers.tex new file mode 100644 index 0000000..225b025 --- /dev/null +++ b/src/benchmarks/walkers.tex @@ -0,0 +1,236 @@ + +\subsection{Walking Strategies} +\label{subsec:walkers} + +Each walking strategy is evaluated using \texttt{UniformSampler} as sampling strategy and +Word2Vec as embedding technique. For these benchmarks, Word2Vec keeps the same +hyperparameters as given in Section \ref{subsec:embedding:techniques}, namely +ten epochs, twenty negative words and a vector size of 500. + +\begin{table}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{lccS[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c} + \toprule + \textbf{Walker} & \textbf{Max. Depth} & \textbf{Max. 
Walks}
+      & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\
+      \midrule
+      \texttt{RandomWalker} & \multirow{6}{*}{2} & \multirow{6}{*}{250} & \textbf{77.94} & 2.08 & 1 \\
+      \texttt{NGramWalker(grams=3)} & & & 76.47 & 1.32 & 2 \\
+      \texttt{HALKWalker(freq\_threshold=0.01)} & & & 75.59 & 2.39 & 3 \\
+      \texttt{SplitWalker} & & & 74.71 & 2.53 & 4 \\
+      \texttt{WalkletWalker} & & & 72.06 & 1.32 & 5 \\
+      \texttt{AnonymousWalker} & & & 65.29 & 1.76 & 6 \\
+      \midrule
+      \texttt{HALKWalker(freq\_threshold=0.01)} & \multirow{6}{*}{4} & \multirow{6}{*}{250} & \textbf{78.82} & 1.50 & 1 \\
+      \texttt{SplitWalker} & & & 77.35 & 4.01 & 2 \\
+      \texttt{RandomWalker} & & & 76.76 & 6.06 & 3 \\
+      \texttt{NGramWalker(grams=3)} & & & 75.88 & 3.90 & 4 \\
+      \texttt{WalkletWalker} & & & 73.82 & 1.95 & 5 \\
+      \texttt{AnonymousWalker} & & & 66.47 & 1.44 & 6 \\
+      \midrule
+      \texttt{HALKWalker(freq\_threshold=0.01)} & \multirow{6}{*}{6} & \multirow{6}{*}{250} & \textbf{81.18} & 4.87 & 1 \\
+      \texttt{SplitWalker} & & & 79.71 & 2.16 & 2 \\
+      \texttt{NGramWalker(grams=3)} & & & 77.65 & 1.95 & 3 \\
+      \texttt{RandomWalker} & & & 75.29 & 2.16 & 4 \\
+      \texttt{WalkletWalker} & & & 71.76 & 1.10 & 5 \\
+      \texttt{AnonymousWalker} & & & 67.65 & 1.86 & 6 \\
+      \bottomrule
+    \end{tabular}
+  }%
+  \caption{Evaluation of the Accuracy of Walking Strategies for \texttt{MUTAG} According
+    to the Maximum Depth per Walk.}
+  \label{benchmarks:walkers:mutag:depth}
+\end{table}
+
+In Table \ref{benchmarks:walkers:mutag:depth}, regardless of the maximum depth
+per walk chosen for the same number of walks per entity, \texttt{HALKWalker}
+reaches a higher model accuracy than \texttt{SplitWalker}. Moreover, these two
+walking strategies provide better results for larger maximum depths per walk,
+while \texttt{RandomWalker} provides a good model accuracy for a small maximum
+depth per walk.
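+
+\texttt{SplitWalker} relies on a splitting function that, by default,
+decomposes each node according to its symbols, capital letters, and numbers. The
+function below is only an illustrative approximation of this default behaviour,
+not the exact implementation, and the example URI is hypothetical.
+\begin{verbatim}
+import re
+
+def split_vertex(name: str) -> list:
+    """Split a vertex name on symbols, digits, and capital letters."""
+    local = re.split(r"[/#]", name)[-1]                  # keep the local name
+    local = re.sub(r"([a-z])([A-Z])", r"\1 \2", local)   # camel-case boundary
+    parts = re.split(r"[^0-9A-Za-z]+|(?<=\D)(?=\d)|(?<=\d)(?=\D)", local)
+    return [p.lower() for p in parts if p]
+
+print(split_vertex("http://dl-learner.org/carcinogenesis#d19"))
+# ['d', '19']
+print(split_vertex("hasStructure"))
+# ['has', 'structure']
+\end{verbatim}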
+ +\newpage + +\begin{figure}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \texttt{AnonymousWalker}, + \texttt{HALKWalker(freq\_threshold=0.01)}, + \texttt{NGramWalker(grams=3)}, + \texttt{RandomWalker}, + \texttt{SplitWalker}, + \texttt{WalkletWalker}, + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Depth per Walk}, + xtick={2, 4, 6}, + ytick={65.30,67,71.80,73.80,76,78,79.70,81.20}, + ] + + \addplot[brown,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/anonymous.csv}; + \addplot[yellow,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/halk.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/ngram.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/random.csv}; + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/split.csv}; + \addplot[darkPurple,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_depth,y=accuracy,col sep=comma] {data/walkers/max-depth/walklet.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Evaluation of the Accuracy of Walking Strategies for MUTAG According to the Maximum Depth per Walk.} + \label{fig:benchmarks:walkers:depth} +\end{figure} + +In Figure \ref{fig:benchmarks:walkers:depth}, the curve of \texttt{SplitWalker} +and \texttt{HALKWalker} have an identical trajectory, except that +\texttt{HALKWalker} indicates better model's accuracy than +\texttt{SplitWalker}. In addition, they return better model's accuracy than the +other walking strategies. + +\begin{table}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{lccS[table-format=2.2]@{${}\pm{}$}S[table-format=1.2]c} + \toprule + \textbf{Walker} & \textbf{Max. Depth} & \textbf{Max. 
Walks} + & \multicolumn{2}{c}{\textbf{Accuracy} (\SI{}{\percent})} & \textbf{Rank} \\ + \midrule + \texttt{SplitWalker} & \multirow{6}{*}{4} & \multirow{6}{*}{100} & \textbf{79.12} & 4.20 & 1 \\ + \texttt{HALKWalker(freq\_threshold=0.01)} & & & 77.65 & 2.53 & 2 \\ + \texttt{WalkletWalker} & & & 73.82 & 1.95 & 3 \\ + \texttt{RandomWalker} & & & 72.35 & 3.77 & 4 \\ + \texttt{NGramWalker(grams=3)} & & & 68.82 & 3.99 & 5 \\ + \texttt{AnonymousWalker} & & & 65.59 & 1.18 & 6 \\ + \midrule + \texttt{SplitWalker} & \multirow{6}{*}{4} & \multirow{6}{*}{250} & \textbf{77.35} & 2.56 & 1 \\ + \texttt{HALKWalker(freq\_threshold=0.01)} & & & 76.76 & 3.65 & 2 \\ + \texttt{RandomWalker} & & & 76.18 & 1.71 & 3 \\ + \texttt{NGramWalker(grams=3)} & & & 73.24 & 3.40 & 4 \\ + \texttt{WalkletWalker} & & & 71.76 & 1.10 & 5 \\ + \texttt{AnonymousWalker} & & & 66.76 & 1.18 & 6 \\ + \midrule + \texttt{HALKWalker(freq\_threshold=0.01)} & \multirow{6}{*}{4} & \multirow{6}{*}{500} & \textbf{79.12} & 3.99 & 1 \\ + \texttt{SplitWalker} & & & 77.35 & 1.50 & 2 \\ + \texttt{WalkletWalker} & & & 77.06 & 3.55 & 3 \\ + \texttt{RandomWalker} & & & 72.06 & 1.32 & 4 \\ + \texttt{NGramWalker(grams=3)} & & & 71.76 & 1.10 & 5 \\ + \texttt{AnonymousWalker} & & & 65.80 & 1.44 & 6 \\ + \bottomrule + \end{tabular} + }% + \caption{Evaluation of the Accuracy of Walking Strategies for \texttt{MUTAG} According + to the Maximum Number of Walks per Entity} + \label{tab:benchmarks:walkers:mutag:walks} +\end{table} + +In Table \ref{tab:benchmarks:walkers:mutag:walks}, regardless of the number of +walks chosen for the same maximum depth per walk, \texttt{SplitWalker} indicates +a correct model's accuracy above the average of walking +strategies. Specifically, \texttt{SplitWalker} allows an average model's +accuracy of \SI{78.53}{\percent}. 
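+
+The walking strategies above are configured in the same way as the sampling
+strategies: only the walker object changes. The following short sketch again
+assumes \texttt{pyRDF2Vec}-style constructors, with the hyperparameters used in
+the tables:
+\begin{verbatim}
+from pyrdf2vec.samplers import UniformSampler
+from pyrdf2vec.walkers import (AnonymousWalker, HALKWalker, NGramWalker,
+                               RandomWalker, SplitWalker, WalkletWalker)
+
+# One walker per benchmarked strategy (max. depth 4, max. 250 walks).
+walkers = [
+    RandomWalker(4, 250, UniformSampler()),
+    HALKWalker(4, 250, UniformSampler(), freq_threshold=0.01),
+    NGramWalker(4, 250, UniformSampler(), grams=3),
+    SplitWalker(4, 250, UniformSampler()),
+    WalkletWalker(4, 250, UniformSampler()),
+    AnonymousWalker(4, 250, UniformSampler()),
+]
+\end{verbatim}
+Each walker is then passed to the \texttt{RDF2VecTransformer} exactly as in the
+sampling sketch of Section \ref{subsec:samplers}.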
+ +\newpage + +\begin{figure}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tikzpicture} + \begin{axis}[ + scale only axis, + grid=major, + grid style={dashed,gray!30}, + height=6cm, + width=9cm, + legend cell align={left}, + legend entries={ + \texttt{AnonymousWalker}, + \texttt{HALKWalker(freq\_threshold=0.01)}, + \texttt{NGramWalker(grams=3)}, + \texttt{RandomWalker}, + \texttt{SplitWalker}, + \texttt{WalkletWalker}, + }, + legend style={ + legend pos=outer north east, + font=\small + }, + ylabel={Accuracy}, + xlabel={Maximum Number of Walks per Entity}, + xtick={100,250,500}, + ytick={65.80,68.80,72,73.20,76,77.50,79.10}, + ] + + \addplot[brown,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/anonymous.csv}; + \addplot[yellow,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/halk.csv}; + \addplot[blue,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/ngram.csv}; + \addplot[green,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/random.csv}; + \addplot[red,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/split.csv}; + \addplot[darkPurple,mark=*,error bars/.cd, y dir=both, y explicit] + table[x=max_walk,y=accuracy,col sep=comma] {data/walkers/max-walks/walklet.csv}; + \end{axis} + \end{tikzpicture} + }% + \caption{Evaluation of the Walking Strategies for Different Data Sets + According to the Maximum Number of Walks per Entity.} + \label{fig:benchmarks:walkers:walks} +\end{figure} + +In Figure \ref{fig:benchmarks:walkers:walks}, the model's accuracy with +\texttt{SplitWalker} tends to stay around \SI{77}{\percent} after a maximum +number of walks of 250. In addition, \texttt{AnonymousWalker}, +\texttt{NGramWalker}, and \texttt{RandomWalker} follow the same curve +trajectory. Specifically, they have a peak of accuracy at a maximum number of +walks per entity of 250. In contrast, the other walking strategies have a +decrease of model's accuracy at this stage. + +\begin{table}[!ht] + \centering + \begin{tabular}{lc} + \toprule + \textbf{Walker} & \textbf{Average Rank} \\ + \midrule + \texttt{HALKWalker(freq\_threshold=0.01)} & 1 \\ + \texttt{SplitWalker} & 2 \\ + \texttt{RandomWalker} & 3 \\ + \texttt{NGramWalker(grams=3)} & 4 \\ + \texttt{WalkletWalker} & 5 \\ + \texttt{AnonymousWalker} & 6 \\ + \bottomrule + \end{tabular} + \caption{Evaluation of the Average Rank of the Walking Strategies.} + \label{tab:benchmark:walkers:average:rank} +\end{table} + +In Figure \ref{tab:benchmark:walkers:average:rank}, +\texttt{HALKWalker(freq\_threshold=0.01)} is the winning walking strategy in +these benchmarks for \texttt{MUTAG}, followed by \texttt{SplitWalker}, and +\texttt{RandomWalker}. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../master-thesis" +%%% End: diff --git a/src/conclusion.tex b/src/conclusion.tex new file mode 100644 index 0000000..af40ba1 --- /dev/null +++ b/src/conclusion.tex @@ -0,0 +1,116 @@ + +\chapter{Conclusion} +\label{chap:conclusion} + +RDF is a W3C standard that ensures that the data diversity remains +machine-interpretable by encoding the semantics of these data. 
Semantic Web and
+Linked Open Data use this standard by interconnecting several sources using KGs,
+which are directed heterogeneous multigraphs composed of triples. Afterward,
+this type of graph is converted into numerical vectors that are used for
+downstream ML tasks.
+
+This conversion can be done with RDF2Vec, an unsupervised task-agnostic
+algorithm for numerically representing KG nodes. Since 2016, this algorithm has
+provided good results using Word2Vec, an embedding technique initially used in
+the NLP field. However, the NLP field has made other advances, notably related
+to the Attention mechanism published in 2014 by \textsc{Bahdanau} to solve the
+bottleneck problem of RNNs. Due to this mechanism, the Transformer architecture
+published in 2017 by \textsc{Vaswani} et al. emerged as an alternative to RNNs,
+introducing two new Attention mechanisms: Scaled Dot-Product Attention and
+Multi-Head Attention. This architecture led to the creation of BERT, a
+state-of-the-art NLP embedding technique published in 2018.
+
+Unlike Word2Vec, BERT generates contextualized embeddings using bidirectional
+representations. For this purpose, BERT can receive one or two sentences, which
+the WordPiece algorithm then tokenizes. WordPiece allows BERT, on the one hand,
+to learn common sub-words and, on the other hand, not to replace OOV words,
+which are a rich source of information, with a special token. Once the text
+corpus is tokenized, the BERT model is pre-trained using MLM and NSP as two
+unsupervised tasks. This pre-training mainly helps overcome the lack of training
+data and allows the model to better understand a bidirectional representation of
+an input at the sentence and token level.
+
+The objectives of this Master's thesis were to evaluate BERT against Word2Vec
+and FastText and to extend RDF2Vec with a new walking strategy and a new
+sampling strategy. Such an evaluation is essential because few research papers
+have compared the classical BERT model with other embedding techniques. In
+addition, the little use made of BERT with KGs is done by creating dedicated
+variants of it. Finally, creating new strategies makes it possible to improve
+the accuracy of a model for specific use cases by focusing on how walks are
+extracted from a KG.
+
+In order to evaluate BERT against FastText and Word2Vec on KGs, a dedicated
+implementation has been proposed. For BERT, it was not possible to use a
+pre-trained model and, therefore, necessary to create this model from
+scratch. For this purpose, the vocabulary of this new model included each
+special token as well as the nodes extracted from the walks by a walking
+strategy and a sampling strategy. For the training of BERT, it was first useful
+to tokenize the nodes by adding special tokens to the left and right of each
+node. Then, the pre-training focused only on the MLM, since the NSP is of no
+interest here: a walk has no semantic link with another one. Finally, the
+training relied on a data collator dedicated to MLM, and a set of
+hyperparameters was chosen as a compromise between training time and model
+accuracy.
+
+BERT's evaluation on \texttt{MUTAG} indicated a training time ranging from 25
+minutes to several hours and a generally lower model accuracy than Word2Vec and
+FastText. These results showed that the model's accuracy increases with the
+maximum depth per walk and the maximum number of walks. Except for rare cases,
+the use of this BERT model for KGs is therefore not worthwhile.
The main reason is
+its training time, which can be excessively long for results at best similar to
+those of Word2Vec and FastText.
+
+This Master's thesis proposed \texttt{SplitWalker} as a new walking
+strategy. Its principle is based on the decomposition of nodes according to a
+splitting function provided by the user. By default, \texttt{SplitWalker} splits
+each node according to its symbols, capital letters, and numbers. When comparing
+this strategy to others with \texttt{MUTAG}, the average rank of the walking
+strategies shows \texttt{SplitWalker} in second place, behind
+\texttt{HALKWalker}. However, a better splitting function could probably have
+improved these results even more.
+
+In addition, a new sampling strategy has been introduced, namely
+\texttt{WideSampler}. This strategy mimics the way humans classify objects, by
+favoring as much as possible the features that objects have in common. More
+precisely, \texttt{WideSampler} tends to favor the most frequent hops in a KG
+and those that give access to more nodes. Moreover, its evaluation on
+\texttt{MUTAG} gives a good model accuracy, allowing it to finish first in the
+benchmarks' average rank. In addition, it is likely that \texttt{WideSampler}
+has more impact on larger KGs.
+
+Within this work, it was also proposed to improve Word2Vec by extracting both
+the parent and child nodes of a root node and by paying attention to the
+positioning of this root node within the walks. These recommendations can bring
+more context to the root node and increase its frequency of occurrence within
+the training samples, which should lead to better root node embeddings.
+
+This Master's thesis had some internal problems that resulted in missing
+benchmarks. These problems were related to the IDLab servers used. Their Stardog
+infrastructure generated significant variance during the benchmarks. More
+precisely, the version of Stardog used had a problem with the garbage collector,
+which prevented correct memory deallocation. In addition, some benchmarks threw
+exceptions because the SPARQL endpoint was not available, an unavailability due
+to updates and internal problems with the servers.
+
+The results obtained were then discussed. The research papers using BERT claimed
+that the classical BERT model was not interesting for KGs. One of the reasons
+for this is that BERT is limited by its inputs and its architecture is not
+optimized to be used for KGs. Most of the research papers have developed their
+own variant of BERT, which can be simple or complex. At the simplest, some
+variants merely replaced the softmax activation function with a sigmoid
+activation function; at the most complex, the traditional BERT model was only a
+tiny part of a larger architecture.
+
+Research directions for future work have been proposed. Firstly, it was
+recommended to evaluate \texttt{SplitWalker} and \texttt{WideSampler} on larger
+KGs to learn more about their impact. Evaluating the traditional BERT model on
+larger KGs would also help confirm the idea that its use is limited. Another
+test would be not to inject pairs of different walks, but rather
+\texttt{(subject, object)} pairs to let BERT deduce the predicate. It would also
+be possible to compare Word2Vec and FastText to other BERT variants to better
+assess their impact. In case of poor results, there is also the possibility of
+creating a new BERT variant.
+ +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: diff --git a/src/discussion.tex b/src/discussion.tex new file mode 100644 index 0000000..4b8ae05 --- /dev/null +++ b/src/discussion.tex @@ -0,0 +1,156 @@ + +\chapter{Discussion} +\label{chap:discussion} + +The results obtained with BERT are less conclusive than those expected. Creating +a BERT model requires considerable time, taking at least several hours or days +of training. In addition, BERT has a lower accuracy than Word2Vec and FastText +and needs to be re-trained for each KG, making its use not pragmatic. However, +this evaluation does not mean that the use of BERT should be prohibited. Within +this report, the research focused on three main fields: +\begin{enumerate} +\item the walks extraction from a KG; +\item the implementation of the BERT model for \texttt{pyRDF2Vec}; +\item the fine-tuning of BERT. +\end{enumerate} + +The research papers referred to in Section \ref{section:related:work:bert} are +related to the implementation of BERT within KGs. However, most of them needed +to create a new architecture that bears few or many changes compared to +BERT. Therefore, it is interesting to examine the creation of these +architectures. + +\section{BERT's Architecture} +\label{sec:discussion:architecture} + +Understanding the implementation of the BERT variant architecture in the +research papers would provide insight into the causes of the inconclusive +results obtained. + +\subsection{KG-BERT} +\label{chap:discussion:kg-bert} + +KG-BERT completes a KG by predicting missing tuples based on other existing +tuples. The design of this architecture has the particularity that BERT can be +trained based on extracted triples. For this training purpose, each triple +injection consists of taking either the name or the description of their +nodes. In this way, KG-BERT considers every node as a word. However, KG-BERT +proposes two ways to inject these triples: +\begin{enumerate} +\item by varying the relevance of a relation using noise to train BERT to deduce +the plausibility of this relation; +\item by using pairs of entities to train BERT to deduce the relation between +these pairs of entities. +\end{enumerate} + +Based on this idea, this Master's thesis also tested BERT again on MUTAG and +BGs. However, the results were still not more conclusive than those of Word2Vec +and FastText. Looking at the +KG-BERT\footnote{\url{https://github.com/yao8839836/kg-bert}} implementation in +more detail, the latter uses a sigmoid function to calculate the triple score +instead of the original BERT softmax function. In addition, KG-BERT has adapted +its cross-entropy loss computation by mainly considering the triple +labels. These adjustments likely played an essential role in the success of +KG-BERT. However, this BERT variant has not been compared with Word2Vec and uses +other datasets than those used for this Master's thesis. + +\subsection{BERT-INT} +\label{subsec:discussion:bert-int} + +BERT-INT predicts two identical entities across multiple KGs by using the name +or description of the current entities, but this time, also of those of their +neighbors. Since there is no propagation of information within neighboring +entities, BERT-INT has the particularity to ignore the characteristic structure +of the KGs. + +Unlike KG-BERT, which has minor modifications compared to the BERT model, +BERT-INT uses BERT as a basic representation unit. 
Specifically, BERT is used to +generate embeddings of the entity name and description and their attributes, +including values. From then on, the BERT-INT architecture combines several BERT +units using a pairwise margin loss to fine-tune the resulting BERT interaction +model. As a result, the BERT-INT architecture is more consistent. The resulting +model consists of the name/description-view interaction plus the ones from the +neighbor-view and the attribute-view. + +\subsection{Graph-BERT} +\label{sec:discussion:graph-bert} + +Graph-BERT mainly helps with node classification and graph clustering tasks. The +main feature of this variant BERT is that it relies solely on the Attention +mechanism, without the necessity to use graph convolution or aggregation +operations. To train the model, Graph-BERT uses unbound subgraphs sampled in +their local contexts to avoid the use of KGs, which can be immense. For this +purpose, there is an injection of sampled nodes and their local context into the +model, which is then refined for the corresponding task. Finally, The +Graph-BERT architecture is composed of five parts: +\begin{multicols}{2} +\begin{enumerate} +\item a linkless subgraph batching; +\item the node input vector embeddings; +\item a graph transformer-based encoder; +\item a representation fusion; +\item a functional component that generates different output according to the + target application task. +\end{enumerate} +\end{multicols} + +\noindent Each of these parts plays an important role in the success of +Graph-BERT. In addition, this variant also uses BERT as a small part of its +architecture. + +\subsection{K-BERT} +\label{subsec:discussion:k-bert} + +With Graph-BERT, K-BERT has one of the most advanced architectures compared to +other BERT variants related to KG. The latter uses four modules: +\begin{enumerate} +\item Knowledge layer: injects relevant triples from a KG, converting the +original sentence into a knowledge-rich sentence tree. +\item Embedding layer: convert a phrase tree into an embedding representation as +the basic BERT architecture can do, except that the embedding layer is a +sentence tree instead of a sequence of tokens. +\item Seeing layer: uses a visible matrix to restrict the visible area of each +token. This restriction ensures that an excess of knowledge does not lead to +changes in the meaning of the original sentence. According to the authors, this +layer is an integral part of the K-BERT success. +\item Mask-Transformer layer: allows BERT to receive the visible matrix as input +using a multiple block stack of mask self-attention blocks.: +\end{enumerate} + +\noindent Even if there are similarities with BERT, K-BERT gets its good +performance to a more advanced architecture than the one initially presented by +BERT. + +\section{Future Works} +\label{sec:discussion:futur} + +From the set of architectures of the KG-oriented BERT variants stated above, +KG-BERT is the one that keeps an almost similar architecture to the basic BERT +architecture. Creating such a variant reinforces the idea that +applying the BERT model to a KG without modifying its architecture is probably +not interesting. This Master's thesis focused on only injecting pairs of walks +to BERT. Probably injecting subject/object pairs as KG-BERT does to train BERT +to predict predicates would lead to better results. 
However, given the lack of
+context and an unsuitable internal architecture, it is likely that the
+traditional BERT model would not be sufficient to provide a clear improvement in
+model accuracy compared to Word2Vec, especially if Word2Vec uses the
+recommendations made by this Master's thesis to include the root node in more
+training samples.
+
+As the objective of this Master's thesis was to evaluate BERT, no new variants
+have been implemented. Such an implementation would probably be more suitable
+for a PhD degree, where more research time is allocated. However, this report
+has the merit of closing several dead ends. It gives a better understanding of
+where the success of BERT comes from in its few evaluations within KGs. In
+addition, none of these papers have evaluated their version of BERT against
+Word2Vec. Various possibilities for future work remain open, such as evaluating
+the previously stated KG-oriented BERT models against Word2Vec and other
+embedding techniques. Moreover, nothing prevents the creation of a new BERT
+architecture for KGs if this proves necessary in the long run. Finally, it may
+be interesting to evaluate \texttt{SplitWalker} and \texttt{WideSampler} on
+larger KGs to know their impact compared to other strategies.
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../report"
+%%% End:
diff --git a/src/embedders.tex b/src/embedders.tex
new file mode 100644
index 0000000..a79a221
--- /dev/null
+++ b/src/embedders.tex
@@ -0,0 +1,15 @@
+
+\chapter{Embedding Techniques}
+\label{chap:embedding:techniques}
+
+This chapter explains the Word2Vec, FastText, and Bidirectional Encoder
+Representations from Transformers (BERT) embedding techniques.
+
+\input{src/embedders/word2vec}
+\input{src/embedders/fasttext}
+\input{src/embedders/bert}
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "../report"
+%%% End:
diff --git a/src/embedders/bert.tex b/src/embedders/bert.tex
new file mode 100644
index 0000000..e2d44bc
--- /dev/null
+++ b/src/embedders/bert.tex
@@ -0,0 +1,244 @@
+
+\newpage
+
+\section{BERT}
+\label{sec:bert}
+
+Bidirectional Encoder Representations from Transformers (BERT) is a
+state-of-the-art NLP embedding technique published in 2018. BERT relies on the
+encoder part of the Transformer, and its objective is to generate an
+unsupervised language representation model. Unlike Word2Vec, the embeddings
+generated by BERT are contextualized using bidirectional representations. In the
+original paper, two pre-trained BERT models trained on the English Wikipedia
+(\SI{2500}{M} words) and the Toronto BookCorpus (\SI{800}{M} words) are
+available:
+\begin{enumerate}
+\item \textbf{BERT-Base}: 12-layer Transformer, 768-hidden, 12-heads, \SI{110}{M} parameters.
+\item \textbf{BERT-Large}: 24-layer Transformer, 1024-hidden, 16-heads, \SI{340}{M} parameters.
+\end{enumerate}
+
+Since these models are pre-trained on a fixed corpus, their use also fixes the
+learning vocabulary. Consequently, like Word2Vec and FastText, BERT can face the
+presence of OOV words. To deal with these unknown words, BERT uses an adaptive
+tokenization algorithm.
+
+\subsection{Tokenization}
+\label{subsec:tokenization}
+
+BERT uses the WordPiece tokenization algorithm for the pre-training of a
+model. The main advantage of this tokenization, proposed in 2015 by Google, is
+that it does not replace unknown words with a \texttt{[UNK]} special token,
+which would imply a loss of information about the input sequence.
As an alternative to this +conversion, WordPiece uses a segmentation method which breaks down a word into +several word pieces units prefixed by \texttt{\#\#}\footnote{Except for Chinese +characters, which are surrounded by spaces before any tokenization.} and +converted into unique identifiers. From then on, this algorithm allows BERT to +learn common sub-words and does not considers OOV and rares words. As for the +vocabulary ($\simeq$ \SI{30 000}{words}), the latter is initializing itself by +taking the individual characters of a language, and adding the most frequent +combinations\footnote{For example, ``u'', followed by ``g'', would have only +been merged if the probability of ``ug'' divided by ``u'', ``g'' would have been +greater than for any other symbol pair.} of a corpus. + +\begin{table}[!ht] + \centering + \caption{Example of Tokenization With BERT.} + \label{tab:bert:tokenization} + \begin{tabular}{cccccccccc} + & The & machine & loves & embeddings. & & & & & \\ + \texttt{[CLS]} & the & machine & loves & em & \#\#bed & \#\#ding & \#\#s & . & \texttt{[SEP]} \\ + 101 & 1996 & 3698 & 7459 & 7861 & 8270 & 4667 & 2015 & 1012 & 102 \\ + \end{tabular} +\end{table} + +In Table \ref{tab:bert:tokenization}, the input sequence of BERT allows +receiving either one sentence (e.g., for text classification) or two sentences +(e.g., for question answering) where each of them is limited to 512 +characters. Beyond this limit, it is necessary to truncate the +sentence. Finally, WordPiece uses three main special tokens: +\begin{enumerate} +\item \texttt{[CLS]}: classification token inserted at the beginning of the + input used for prediction. +\item \texttt{[SEP]}: separator token used to indicates the end of each sentence; +\item \texttt{[PAD]}: padding token used when batching sequences of different lengths. +\end{enumerate} + +\subsection{Input Embeddings} +\label{subsec:bert:input:embeddings} + +Unlike Transformer, BERT accepts one or two sentences as input sentences. From +then on, BERT encodes these inputs to use them for the model pre-training. + +\newpage + +\begin{figure}[!ht] + \centering + \includegraphics[width=\textwidth]{img/embedders/bert/input-embeddings} + \caption{Example of Sentence Pair Encodding.} + \source{\textsc{Devlin} et al. -- BERT} + \label{fig:sentence:pair:encoding} +\end{figure} + +In Figure \ref{fig:sentence:pair:encoding}, the encoding of the BERT's input +requires the sum of three vector embeddings: +\begin{enumerate} +\item \textbf{Token embeddings}: the word pieces embeddings (cf. Section +\ref{subsec:tokenization}). +\item \textbf{Segment embeddings}: the embeddings that associate a token with +its belonging sentence. In the case of a single sentence input sequence, the +vector contains only unitary values. Otherwise, the \texttt{[SEP]} token and the +tokens related to the first sentence are assigned a zero value. Those tokens of +the second sentence are assigned a unitary value. +\item \textbf{Position embeddings}: embeddings that preserve the contextual +information of the tokens by injecting their positioning into the input +embeddings, as the Transformer architecture does. +\end{enumerate} + +\subsection{Pre-training} +\label{subsec:bert:pre-training} + + +The BERT model's pre-training gives it the ability to learn language by training +simultaneously on two unsupervised tasks: the \emph{Masked Language Model} (MLM) +and \emph{Next Sentence Prediction} (NSP). 
These two tasks mainly help overcome +the lack of training data and allow the model to understand better a +bidirectional representation of an input at the sentence and token level. + +\subsubsection{Masked Language Model} +\label{subsubsec:bert:mlm} + +MLM is the first task use by the pre-training BERT model. This task helps BERT +learn bidirectional contextual representations of tokens through the encoder +layers by predicting masked tokens from the input sequence. For this purpose, +MLM masks and predicts tokens from the input sequence. Mathematically, the +following equation defines this prediction of masked tokens. + +\begin{equation} + p\left(x_1,\dotsc,x_n\right) = \sum_{i = 1}^np\left(x_i|x_1,\dotsc,x_{i - 1}, x_{i + 1},\dotsc, x_n\right) +\label{eq:def:mlm} +\end{equation} + +where each input sequence generally has \SI{15}{\percent} of its tokens hidden +according to the following subrules: +\begin{itemize} +\item a token is replaced by a \texttt{[MASK]} token in \SI{80}{\percent} of the + cases; +\item a token is replaced randomly in \SI{10}{\percent} of the cases; +\item a token remains unchanged in \SI{10}{\percent} of the cases. +\end{itemize} + +These subrules prevent the Transformer encoder from being forced to maintain a +distributive contextual representation of each input token. From then on, it is +not recommended to modify the default token masking. Too little masking would +imply a too expensive model to train, and too much masking would mean a lack of +context for a token. + +\subsubsection{Next Sentence Prediction} +\label{subsubsec:bert:nsp} + +The NSP is the second task use by the pre-training BERT model. This task helps +BERT learn sentence relationships by solving a binary classification problem +using Attention information shared between sentences. +\begin{table}[!ht] + \centering + \caption{Example of NSP With BERT.} + \label{tab:bert:nsp} + \begin{tabular}{lll} + \textbf{Sentence $\mathcal{A}$} & \textbf{Sentence $\mathcal{B}$} & \textbf{Label} \\ + GNU Emacs is the best text editor. & I should use it. & \texttt{isNextSentence} \\ + GNU Emacs is the best text editor. & I have a cat. & \texttt{NotNextSentence} \\ + \end{tabular} +\end{table} + +In Table \ref{tab:bert:nsp}, NSP allows the model to train itself to predict +whether sentence $\mathcal{B}$ is a continuation of sentence $\mathcal{A}$. For +this purpose, the generation of the input sequences ensures continuity of +$\mathcal{A}$ \SI{50}{\percent} of the time, labeled as \texttt{isNext}. The +rest of the time, $\mathcal{B}$ is related to a random sentence from the +provided training data set, labeled as \texttt{NotNextSentence}. + +\subsubsection{Output} +\label{subsubsec:output} + +Once the BERT model understands the language representation, this model is +suitable for most NLP tasks. These tasks include Neural Machine Translation, +question answering, sentiment analysis, and text summarization. +\begin{figure}[!ht] + \centering + \includegraphics[width=0.6\textwidth]{img/embedders/bert/pre-training} + \caption{Pre-Training With BERT.} + \source{\textsc{Devlin} et al. -- BERT} + \label{fig:pre-training} +\end{figure} + +In Figure \ref{fig:pre-training}, BERT produces the hidden states of each input +token where each hidden state consists of a vector of the same size (e.g., 768 +for BERT-Base) as the others, containing float numbers. Among these hidden +states, the first position is related to the hidden state of the token +\texttt{[CLS]}. 
This hidden state is interesting as it determines the continuity +of two sentences, which can later be used for fine-tuning tasks. + +Furthermore, the hidden states pass through a last FFN containing as many +neurons as tokens in the vocabulary used. The pre-training phase ends by +obtaining probability distribution on the hidden states using a softmax +activation function at the output of this FFN. Finally, BERT compares the +distribution of the current one-hot encoded vector token with the predicted word +and train the network using cross-entropy. It is important to note that the loss +only considers the prediction of masked tokens produced by the network to raise +awareness of the context during the network's training. + +\subsection{Fine-Tuning} +\label{subsec:bert:fine-tuning} + +The fine-tuning consists of training the BERT model on a specific NLP +task. Depending on the use case, either the final hidden state of the +\texttt{[CLS]} token or the hidden of the other tokens will be taken. The former +is use for classification-related tasks and the latter for more complex tasks +such as the Stanford Question Answering Dataset (SQuAD), Named Entity +Recognition (NER), and Multi-Genre Natural Language Inference (MNLI). +\begin{figure}[!ht] + \centering + \includegraphics[width=0.7\textwidth]{img/embedders/bert/fine-tuning} + \caption{Fine-Tuning for SQuAD.} + \source{\textsc{Devlin} et al. -- BERT} + \label{fig:bert:fine-tuning} +\end{figure} + +In Figure \ref{fig:bert:fine-tuning}, the fine-tuning of the model for SQuAD +takes as input sequence a question and a paragraph containing the answer to this +question. Regarding the model's output for this NLP task, BERT returns the +answer to a submitted question. For this purpose, BERT highlights the starting +and ending word of a given paragraph that includes the answer to that question +only if that answer is in the paragraph. Therefore, it is interesting to look at +the probability that a word is the start/end of the answer span. + +\begin{definition}[Word Probability -- Start/End of the Answer Span] + Let $S \in \mathbb{R}^H$ be the start vector, $E \in \mathbb{R}^H$ be the end + vector, $T_i \in \mathbb{R}^H$ be the final hidden vector for the + i\textsuperscript{th} input token, and $j$ be the position of the ending + word. Mathematically, the following two equations represent the probability that + the word $i$ is the start/end of the answer span. + \begin{equation} + \mathrm{PS_i} = \frac{e^{ST_i}}{\sum_je^{ST_j}} \qquad \mathrm{PE_i} = \frac{e^{ET_i}}{\sum_je^{ET_j}} + \label{eq:def:bert:word:probabilities} + \end{equation} + + where the sum of $ST_i$ and $ET_j$ defines the score of a candidate span from + position $i$ to position $j$. Afterward, the maximum scoring span where $j \geq + i$ is used as a prediction. + \label{def:squad:word:probability} +\end{definition} + +The starting and ending tokens containing the answer are determined using the +softmax activation function on the dot product between the output embeddings and +the set of weights. From then on, the word with the highest probability is +assigned as the start word, and the process continues to iterate to determine +the end word. As most hyperparameters are similar to those of the pre-training, +the only new hyperparameters added during the fine-tuning concern a +classification layer. From then on, an exhaustive search of values can be done +to choose the best model according to these hyperparameters. 
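To make Definition \ref{def:squad:word:probability} more concrete, the
following minimal NumPy sketch computes the start/end probabilities and selects
the maximum scoring span with $j \geq i$. It assumes the final hidden vectors
$T_i$ and the start/end vectors $S$ and $E$ are already available; all names are
illustrative and do not come from a particular library.

\begin{verbatim}
import numpy as np

def best_answer_span(T, S, E):
    """Pick the span (i, j), j >= i, that maximizes S.T_i + E.T_j."""
    start_logits = T @ S               # S . T_i for every token i
    end_logits = T @ E                 # E . T_j for every token j
    # Softmax over the tokens, i.e. PS_i and PE_i of the definition.
    p_start = np.exp(start_logits - start_logits.max())
    p_start /= p_start.sum()
    p_end = np.exp(end_logits - end_logits.max())
    p_end /= p_end.sum()
    # Maximum scoring candidate span with j >= i, used as the prediction.
    best, best_score = (0, 0), -np.inf
    for i in range(len(T)):
        for j in range(i, len(T)):
            if start_logits[i] + end_logits[j] > best_score:
                best, best_score = (i, j), start_logits[i] + end_logits[j]
    return best, p_start, p_end

# Toy usage with random vectors (6 tokens, hidden size H = 4).
rng = np.random.default_rng(0)
T, S, E = rng.normal(size=(6, 4)), rng.normal(size=4), rng.normal(size=4)
print(best_answer_span(T, S, E)[0])
\end{verbatim}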
+ +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/embedders/fasttext.tex b/src/embedders/fasttext.tex new file mode 100644 index 0000000..a594781 --- /dev/null +++ b/src/embedders/fasttext.tex @@ -0,0 +1,146 @@ + +\section{FastText} +\label{sec:fasttext} + +FastText is an extension to Word2Vec created by Facebook in 2016, based on the +decomposition of a word into character $n$-grams to improve the embeddings +obtained by an SG model. Unlike Word2Vec, which treats each word in a corpus as +an atomic entity, the decomposition made by FastText mainly solves the +embeddings creation for OOV words and parameter sharing between words of the +same radical. Finally, FastText can be used for unsupervised learning and +supervised learning with text classification for tasks such as spam filtering. + +\subsection{Sub-Word Generation} +\label{subsec:fasttext:sub-word} + +The generation of sub-words called $n$-grams is an integral part of the success +of FastText. Therefore, it is good to know the details of such a generation. + +\begin{definition}[Sub-word Generation] + Consists of adding angular brackets on either side of a word used as a + delimiter and generating character $n$-grams of length $n$. In practice, picking + this length allows extraction of $n$-grams $\geq$ 3 and $\geq + 6$~\citep{DBLP:journals/tacl/BojanowskiGJM17}. Let ``flying'' be a word. The + following table illustrates the sub-word generation for character $n$-grams of + length 3, 4, 5, and 6. + + \begin{table}[!ht] + \centering + \caption{Sub-word Generation for Character $N$-Grams of Length 3, 4, 5, and 6.} + \label{tab:fasttext:sub-word} + \begin{tabular}{ccl} + \toprule + \textbf{Word} & \textbf{Length} & \textbf{Character $n$-grams} \\ + \midrule + \multirow{4}{*}{flying} & 3 & \\ + & 4 & \\ + & 5 & \\ + & 6 & \\ + \bottomrule + \end{tabular} + \end{table} + + In Table \ref{tab:fasttext:sub-word}, each character $n$-grams of length $n$ + is generated by sliding a 2-characters window from the beginning of the bracket + to its end. + \label{def:sub-word:generation} +\end{definition} + +As a result, the vector of a word corresponds to the sum of these $n$-grams of +characters. This word decomposition into character $n$-grams comes at a storage +cost since storing all the unique $n$-grams can quickly be significant. The +original paper uses a variant of the Fowler-Noll-Vo (FNV) hash function, called +\emph{FNV-1a}, to hash character sequences into integer values to reduce this +memory cost. The use of FNV is helpful for this use case since FNV was not +designed for cryptography but for the fast use of hash tables and +\emph{checksums}. The checksum results of performing a cryptographic hash +function on a piece of data, usually a file, to ensure that the data is +authentic and error-free. Moreover, it is necessary to learn a certain amount of +embeddings to hash character sequences. Specifically, this amount designates the +hash bucket's size to better distribute these character $n$-grams for sorting +and retrieval purposes. From then on, the hashing of each $n$-gram to a number +between one and $N$, reduces the vocabulary size in the counterpart of potential +\emph{collisions}. This hash collision means that several character $n-grams$ +stored as a key can result in the same hash and checksum. Therefore no longer +ensure the authenticity of a value. + +In some circumstances, the model size may be excessive. 
In this case, it is +still possible to reduce the hash size where the appropriate value is near +$20000$. However, it is also possible to reduce the size of the vectors at the +expense of smaller model accuracy. + +\subsection{Training} +\label{subsec:fasttext:training} + +Like Word2Vec, the training of FastText’s SG model can use HSM or Negative +sampling. To better understand its application with negative sampling, one can +take the following ``I always remember her'' sentence is taken. Let ``remember'' +be the target word, using 3-grams and a window size of 3. This example predicts +the ``always'' and ``her'' context words. First of all, this model computes the +embeddings of the target word by summing the embeddings for the character +$n$-grams and the whole word itself: +\begin{align} + \mathrm{E}_{remember} = e_{} + e_{remember} + \label{eq:fasttext:embeddings} +\end{align} + +Afterward, each context word is taken directly from their embeddings without +adding the character $n$-grams. Then, several random negative samples are selected +with a probability proportional to the square root of the unigram +frequency. From then on, a context word is related to five random negative +words. In the absence of $n$-gram embeddings, the Word2Vec model is used, +specifying a maximum $n$-gram length of zero. + +After selecting samples, the dot product between the target word and the actual +context words computes the probability distribution. Unlike Word2Vec, there is +this time use of the sigmoid function instead of the softmax function. Finally, +the embeddings are updated with an SGD optimizer according to the calculated +loss to bring the actual context words closer to the target word. Afterward, +there is an incrementation in the distance of the negative samples. + +This update of hyper-parameters for embeddings of $n$-grams adds a significant +amount of extra computation to the training process. Moreover, CBOW generates +the word embeddings by summing and averaging $n$-grams embeddings, which adds cost +compared to SG. However, these additional calculations benefit from a set of +embeddings containing the sub-words embeddings. From then on, they allow a +better accuracy of a model in most cases. + +\subsection{Advantages and Disadvantages} +\label{subsec:fasttext:pro:cons} + +In a non-exhaustive way, the advantages of the FastText: +\begin{multicols}{2} +\begin{itemize} +\item The use of unsupervised learning and supervised learning with text +classification for tasks such as spam filtering. +\item Captures the meaning of suffixes/prefixes for the words given in a corpus. +\item Generation of better word embeddings for rare words as their character +$n$-grams are shared with other more frequent words, adding more neighboring +contextual words and improving its probability of being selected. +\columnbreak + \item Generation of word embeddings for OOV words. +\item Better semantic similarity between two words (e.g., king and kingdom) by +using extra information about the sub-words. +\item No compromise of accuracy when stopwords are present. +\item Significant improvement in model accuracy when used to perform syntactic +word analogy tasks for morphologically rich languages (e.g., French and German). +\end{itemize} +\end{multicols} + +As for its main disadvantages, they are the following: +\begin{multicols}{2} +\begin{itemize} +\item Requires much more RAM than Word2Vec due to the sub-words generation for +each word. 
+\item Lower accuracy compared to Word2Vec when a model is used on semantic +analogy tasks. +\item FastText is \SI{50}{\percent} slower to train than the regular Skip-Gram model due +to the added overhead of $n$-grams compared to Word2Vec. +\item Difficulty in determining the best value for the $n$-grams generation. +\end{itemize} +\end{multicols} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/embedders/word2vec.tex b/src/embedders/word2vec.tex new file mode 100644 index 0000000..a5cb943 --- /dev/null +++ b/src/embedders/word2vec.tex @@ -0,0 +1,467 @@ + +\section{Word2Vec} +\label{sec:w2v} + +Word2Vec is an unsupervised learning algorithm published in 2013, which, based +on a corpus of text, provides a vector representation for each word of that +text, using a two-layer neural network. This algorithm was initially implemented +to learn word representations from large data sets efficiently but emerged as a +standard embedding technique. According to the use case, two models are +available: Continuous Bag-Of-Words (CBOW) and Skip-Gram (SG). + +\subsection{Continuous Bag-of-Words Model} +\label{subsec:w2v:cbow} + +CBOW is an unsupervised model that predicts a target word based on context words +and window size. From a semantic point of view, the model is named +``Bag-of-Words'' since the order of the context words does not influence its +training. +\begin{figure}[!ht] + \centering + \includegraphics[width=0.75\textwidth]{img/embedders/w2v/cbow} + \caption{CBOW Model Architecture.} + \source{ + \href{https://lilianweng.github.io/lil-log/2017/10/15/learning-word-embedding.html} + {Lilian Weng -- Learning Word Embedding} + } + \label{fig:w2v:cbow:architecture} +\end{figure} + +In Figure \ref{fig:w2v:cbow:architecture}, the CBOW model architecture starts by +taking one or several targets word as input, represented as a $V$-dimensional +one-hot encoding vector. $V$ is the dictionary's size of unique words present in +a text or in a training data set. Using a dot product, this/these input +vector(s) is then multiplied by a first $W$ weight matrix, named embedding +matrix. This matrix is of dimension $V \times N$, where $N$ is the number of +features that each unique word in this vocabulary has defining the +\emph{embedding size}. Therefore, these dot products produce an $N$-dimensional +hidden vector for each input vector. However, if there multiple input context +vectors, these hidden vectors are then averaged element-wise into a single +hidden vector, where this vector chooses the size of the vectors that will be +used later. + +According to the semantics of the words of the dictionary, a new dot product is +made, but this time, between the calculated hidden vector and another $W'$ +weight matrix, named \emph{context matrix}. This matrix being now of dimension +$N \times V$. Then, their product generates an output vector of dimension $V$ +subjected to a softmax activation function. Therefore, this function ends by +calculating a probability distribution and returns the probability that a word +is a target word for the given context words. Finally, a cross-entropy loss +function is applied to compute the loss between the true probability +distribution of the given target word and the calculated model probability. + +\begin{definition}[Cross-Entropy] + Measures the difference between two probability distributions for a given + random variable. 
Let $p$ be the true probability distribution, $q$ be the probability
  distribution computed by the model, and $C$ be the number of possible
  classes. Mathematically, the cross-entropy loss can be defined as follows:
  \begin{equation}
    H(p, q) = -\sum_{c=1}^Cp(c)\log\left(q(c)\right)
    \label{eq:cross-entropy}
  \end{equation}
  \label{def:cross-entropy}
\end{definition}

After computation of the cross-entropy loss, back-propagation updates the
weights of the embedding matrix according to this loss. These steps are then
repeated with other context words and another target word for a specified number
of epochs. Once the training is done, the embedding matrix is used to generate
the word embeddings from the one-hot encodings. Formally, given a sequence of
training words $w_1, w_2, \dotsc, w_T$, and a context window $c$, the objective
of the CBOW model is to maximize the average log probability:
\begin{equation}
  \frac{1}{T}\sum_{t=1}^T\log\left(p\left(w_t|w_{t-c}, \ldots, w_{t + c}\right)\right)
  \label{eq:w2v:cbow:probability}
\end{equation}

where $T$ is the number of training words, and the probability
$p\left(w_t|w_{t-c}, \ldots, w_{t + c}\right)$ is computed using the softmax
function (cf. Definition \ref{def:softmax}):
\begin{equation}
  p\left(w_t|w_{t-c}, \ldots, w_{t + c}\right) =
  \frac{\exp\left(\overline{v}^{\top}v'_{w_t}\right)}{\sum_{w=1}^W\exp\left(\overline{v}^{\top}v'_w\right)}
  \label{eq:w2v:cbow:probability:details}
\end{equation}

where $W$ is the vocabulary size, $v'_w$ is the output vector of the word $w$,
and $\overline{v}$ is the averaged input vector of all the context words:
\begin{equation}
  \overline{v} = \frac{1}{2c}\sum_{-c\leq j \leq c,j \neq 0}v_{w_{t + j}}
  \label{eq:w2v:cbow:probability:details2}
\end{equation}

\noindent Finally, regarding the generation of embeddings, the whole network is
not used, but only its first layer.

\subsection{Skip-Gram Model}
\label{subsec:w2v:sg}

SG is an unsupervised model that predicts context words from a target word,
according to a window size and a sentence. Unlike CBOW, these context words are
defined as multiple pairs of words (cf. Table \ref{def:window:size}), serving as
training samples for this model. Such training allows this model to learn word
vector representations suitable for predicting nearby words within a text,
except for common words and stop words (cf. Section
\ref{subsec:w2v:sfw}). Finally, SG compresses the information into a low
dimensional space, such that this model learns a continuous low dimensional
representation of the words.

The window size for each target word is arbitrarily chosen between one and a
small positive number. This choice adds randomness during training and improves
future predictions. In the case of a window size greater than two, there are
likely more than two context words for a target word. If such a situation
occurs, SG randomly picks a word from these context words to form a pair with
the target word. Then, this model counts the number of times each pair of words
appears, and the training starts.
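To illustrate how such training pairs are generated, the following sketch
applies a randomly drawn window size to each target word of a toy tokenized
sentence and returns the resulting (target, context) pairs. It is a simplified
sketch of the idea described above, not the actual Word2Vec implementation.

\begin{verbatim}
import random

def skipgram_pairs(tokens, max_window=3, seed=42):
    """Generate (target, context) training pairs for the SG model.

    For each target word, a window size is drawn between 1 and
    max_window, which adds randomness during training.
    """
    random.seed(seed)
    pairs = []
    for t, target in enumerate(tokens):
        window = random.randint(1, max_window)
        for j in range(-window, window + 1):
            if j == 0 or not 0 <= t + j < len(tokens):
                continue
            pairs.append((target, tokens[t + j]))
    return pairs

print(skipgram_pairs(["i", "always", "remember", "her"]))
# e.g. [('i', 'always'), ('i', 'remember'), ('always', 'i'), ...]
\end{verbatim}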
+ +Similar to CBOW, given a sequence of target words $w_1, w_2, \ldots, w_t$, and a +context window $c$, the objective of the SG model is to maximize the average log +probability: +\begin{equation} + \frac{1}{T}\sum_{t=1}^T\sum_{-c \leq j \leq c, j \neq 0}\log\left(p\left(w_{t+j}|w_t\right)\right) + \label{eq:w2v:sg:probability} +\end{equation} + +where $T$ is the number of training samples, and the probability $p\left(w_{t + +j}|w_t\right)$ is also computed using the softmax function (cf. Definition +\ref{def:softmax}): +\begin{equation} + p\left(w_o|w_i\right) = \frac{\exp\left(u_{w_o}^Tv_{w_i}\right)}{\sum_{w=1}^W\exp\left(u_w^Tv_{w_i}\right)} + \label{eq:w2v:sg:probability:details} +\end{equation} +where $W$ is the vocabulary size, $v$ the input representation of a word, and +$u$ the output representation of a word. + +\begin{figure}[!ht] + \centering + \includegraphics[width=0.8\textwidth]{img/embedders/w2v/skip-gram} + \caption{SG Model Architecture.} + \source{ + \href{https://lilianweng.github.io/lil-log/2017/10/15/learning-word-embedding.html} + {Lilian Weng -- Learning Word Embedding} + } + \label{fig:w2v:sg:architecture} +\end{figure} + +In Figure \ref{fig:w2v:sg:architecture}, the SG model architecture can be seen +as the inverse of the CBOW architecture. With this architecture, instead of +having context words and predicting a target word, it takes two words. From then +on, a model will predict a target word according to a context word. In addition, +since there is only one input word, the embedding matrix directly generates a +hidden layer with no need to do an average. Finally, the cross-entropy loss +function is applied this time to compute the loss between the true probability +distribution of the given context word and the calculated model probability. + +\subsection{Subsampling Frequent Words} +\label{subsec:w2v:sfw} + +Subsampling frequent words is a technique whose objective is to reduce the +number of training examples for a Word2Vec model. To accomplish this, it assumes +that predicting context words that are too frequent (e.g., ``the'') provides +little semantic value to differentiate a context +~\citep{inproceedings:mikolov}. In contrast, infrequent words are more likely to +convey specific information. Therefore, to improve the balance between +infrequent and frequents words, this technique randomly eliminates words from a +more frequent corpus than a certain threshold. This suppression occurs before a +corpus is processed in word-context pairs. + +Let $w_i$ be a word, $f_{w_i}$ be a word frequency in a corpus, and $t$ be a +chosen threshold, typically around $10^{-5}$. Mathematically, the discarding of +a word from a corpus is done as follows: +\begin{equation} + P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}} + \label{eq:w2v:sfw} +\end{equation} + +Therefore, each word $w_i$ present in a text for a window of size $c$ has a +probability of being deleted, where each deleted word reduces the training +samples by most $c$ times. Therefore, words whose frequency is above this +threshold, will be aggressively subsampled, preserving the frequency +ranking. 
However, even though this formula speeds up learning and even +significantly improves the accuracy of learned vectors of infrequent +words~\citep{inproceedings:mikolov}, the Word2Vec implementation uses another +more elaborate formula to discard a word: +\begin{equation} + P(w_i) = \left(\sqrt{\frac{z(w_i)}{t}} + 1\right)\frac{t}{z(w_i)} + \label{eq:w2v:sfw:real} +\end{equation} + +where $t$ has a default threshold of $10^{-3}$ and $z(w_i)$ is the fraction of +the total words in the corpus that corresponds to this word. For example, if the +``cat'' word occurs 1000 times in a billion words corpus, then $z($``cat''$) = +10^{-6}$. Therefore, $P(w_i) = 1$ when $z(w_i) \leq 0.0032z(w_i) \leq 0.0032$ +means to keep every instance of a word that represented \SI{0.32}{\percent} or +less and subsampled those that have a higher percentage. Please note that +$P(w_i)$ does not correspond to a probability since it is no longer bounded +between 0 and 1. Finally, from the accuracy point of view, subsampling frequent +words can improve some accuracy and decrease one of the others. + +\subsection{Hierarchical Softmax} +\label{subsec:w2v:hsm} + +Hierarchical Softmax (HSM) is an efficient softmax approximation technique that +uses a multilayer binary tree structure to reduce the computational cost of +training a softmax neural network~\citep{DBLP:conf/aistats/MorinB05}. In this +data structure, each node without child nodes, called \emph{leaf nodes}, +corresponds to a word and each internal node stands for relative probabilities +of the children nodes. + +For better accuracy, HSM structures the Word2Vec vocabulary using a +\textsc{Huffman} tree, a binary tree data structure where data is stored in leaf +nodes, without any particular order. To structure this vocabulary, HSM uses this +tree so that frequent words are closer to the root node of the tree while +infrequent words are deeper in this tree. Therefore, more frequent words have a +greater weight than infrequent words, where the \emph{path length} is +proportional to the frequency of a word. The path length of a tree is defined as +the number of nodes that must be traversed to reach a specific +word. Consequently, a \textsc{Huffman} tree always has $n$ leaf nodes for $n-1$ +internal nodes, where each node is characterized by a weight defined by a +numerical identifier. As a result, the weight of each parent node is equal to +the sum of the lower weights of its children. + +To compute the probability distribution of reaching a word, HSM uses a sigmoid +activation function and two matrices. One for the inputs and one for the outputs +differ on the values stored in each row. +\begin{table}[!ht] + \centering + \begin{subtable}[b]{0.5\textwidth} + \centering + \begin{tabular}{cc} + \toprule + \textbf{Target Word} & \textbf{Context Words} \\ + \midrule + \dots & \dots, \dots \\ + sleep & sleep, cat \\ + \bottomrule + \end{tabular} + \caption{Input Matrix} + \end{subtable}% + \begin{subtable}[b]{0.5\textwidth} + \centering + \begin{tabular}{ccc} + \toprule + \textbf{Node} & \textbf{Sigmoid Value} & \textbf{Label} \\ + \midrule + \dots & \dots & \dots \\ + 14 & 0.84 & 1 \\ + 20 & 0.23 & 0 \\ + \bottomrule + \end{tabular} + \caption{Output Matrix} + \end{subtable} + \caption{HSM Matrices for the SG Model.} + \label{fig:w2v:hsm:matrices} +\end{table} + +In Table \ref{fig:w2v:hsm:matrices}, the input matrix of a SG model contains +training samples for different target words. 
On the other hand, the output +matrix has rows of node weights related to labels and a probability determined +by the sigmoid function. This label, which can only have two values (zero or +one), helps informs navigation in the tree based on a node. By convention, zero +means to browse the left branch of this tree and one, the other +branch. Therefore, the output matrix reports that the ``cat'' context word is +found for the ``sleep'' target word by turning left at node 20 and turning right +at node 14. + +\begin{figure}[!ht] + \centering + \resizebox{0.6\textwidth}{!}{% + \begin{tikzpicture} + \node[entity,minimum size=0.9cm,fill=myblue] (node_20) at (0,0) {20}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_14) at (-2.5,-1) {14}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_6) at (2.5,-1) {6}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_dog) at (-4,-2.5) {8}; + \node[circle,draw,fill=myred] (node_cat) at (-1,-2.5) {9}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_rabbit) at (1,-2.5) {4}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_2) at (4,-2.5) {2}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_horse) at (2.5,-4) {1}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_fish) at (5.5,-4) {1}; + + \draw[color=red,text=black] (node_20) -- (node_14) node[edge] {\textcolor{red}{0}}; + \draw (node_20) -- (node_6) node[edge] {1}; + + \draw[color=red,text=black] (node_14) -- (node_cat) node[edge] {\textcolor{red}{1}}; + \draw (node_14) -- (node_dog) node[edge] {0}; + + \draw (node_6) -- (node_rabbit) node[edge] {0}; + \draw (node_6) -- (node_2) node[edge] {1}; + + \draw (node_2) -- (node_horse) node[edge] {0}; + \draw (node_2) -- (node_fish) node[edge] {1}; + + \node[label,below of=node_cat] {\textcolor{red}{\textbf{cat}}}; + \node[label,below of=node_dog] {dog}; + \node[label,below of=node_rabbit] {rabbit}; + \node[label,below of=node_horse] {horse}; + \node[label,below of=node_fish] {fish}; + \end{tikzpicture} + }% + \caption{\textsc{Huffman} Tree for Word2Vec.} + \label{fig:w2v:hsm:huffman} +\end{figure} + +In Figure \ref{fig:w2v:hsm:huffman}, a vocabulary of five words is structured in a +\textsc{Huffman} tree containing four internal nodes, where each edge indicates a +label. This training aims to teach the model to navigate in the tree by finding +a context word based on a target word. For this search, the tree is traversed +from top to bottom, where HSM does the dot product between the target word +vector and the current node. Afterward, the result is sent to a sigmoid +activation function which returns a probability value. According to this value, +HSM checks that this probability is correct based on a label and updates the +neurons' weights if needed. This process is repeated until the word context for +a target word is found. Since training is not done on all words, there is no +need to traverse every tree node. As a result, the complexity of the gradient +goes from $\mathcal{O}(W)$ to $\mathcal{O}(\log_2(w))$. Once the training is +completed, the SG model (the same reasoning is used for CBOW) can use this tree +to predict a context word for a target word, where this time the label is not +specified in the output matrix. 
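As a small illustration of how HSM scores a single context word, the sketch
below multiplies one sigmoid decision per internal node along the word's
\textsc{Huffman} path, following the convention that a zero label browses the
left branch and a one label the right branch. The path encoding as a list of
(node vector, label) pairs is an illustrative simplification, not the actual
Word2Vec data structure.

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def hsm_probability(target_vec, path):
    """Probability of reaching one leaf (context word) in the tree.

    path: list of (node_vector, label) pairs from the root to the leaf,
    where label 0 means the left branch and label 1 the right branch.
    """
    prob = 1.0
    for node_vec, label in path:
        p_right = sigmoid(np.dot(target_vec, node_vec))
        prob *= p_right if label == 1 else (1.0 - p_right)
    return prob

# Toy usage: 4-dimensional vectors, path root -> node 14 -> "cat".
rng = np.random.default_rng(1)
target = rng.normal(size=4)
path = [(rng.normal(size=4), 0), (rng.normal(size=4), 1)]
print(hsm_probability(target, path))
\end{verbatim}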
+ +\begin{figure}[!ht] + \centering + \resizebox{0.6\textwidth}{!}{% + \begin{tikzpicture} + \node[entity,minimum size=0.9cm,fill=myblue] (node_20) at (0,0) {20}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_14) at (-2.5,-1) {14}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_6) at (2.5,-1) {6}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_dog) at (-4,-2.5) {8}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_cat) at (-1,-2.5) {9}; + + \node[entity,minimum size=0.9cm,fill=myred] (node_rabbit) at (1,-2.5) {4}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_2) at (4,-2.5) {2}; + + \node[entity,minimum size=0.9cm,fill=myblue] (node_horse) at (2.5,-4) {1}; + \node[entity,minimum size=0.9cm,fill=myblue] (node_fish) at (5.5,-4) {1}; + + \draw (node_20) -- (node_14) node[edge] {0.73}; + \draw[color=red,text=black] (node_20) -- (node_6) node[edge] {\textcolor{red}{0.27}}; + + \draw (node_14) -- (node_cat) node[edge] {0.55}; + \draw (node_14) -- (node_dog) node[edge] {0.45}; + + \draw[color=red,text=black] (node_6) -- (node_rabbit) node[edge] {\textcolor{red}{0.62}}; + \draw (node_6) -- (node_2) node[edge] {0.38}; + + \draw (node_2) -- (node_horse) node[edge] {0.81}; + \draw (node_2) -- (node_fish) node[edge] {0.19}; + + \node[label,below of=node_cat] {cat}; + \node[label,below of=node_dog] {dog}; + \node[label,below of=node_rabbit] {\textcolor{red}{\textbf{rabbit}}}; + \node[label,below of=node_horse] {horse}; + \node[label,below of=node_fish] {fish}; + \end{tikzpicture} + }% + \caption{\textsc{Huffman} Tree for Word2Vec.} + \label{fig:w2v:hsm:huffman:prediction} +\end{figure} + +In Figure \ref{fig:w2v:hsm:huffman:prediction}, the tree edges provide the +number of occurrences that a context word is to the left or right of a node, +according to a given target word. After training this SG model for training +data, the output matrix indicates that \SI{27}{\percent} of the time, the +context word for the same target word was in the right branch of the root node, +and \SI{73}{\percent} of the time, this context word was left branch. Based on +this principle, the other nodes also return a probability distribution for their +left and right branches that varies according to the input vector of a target +word. + +Therefore, the probability of having ``rabbit'' as a context word for the +``sleep'' target word is equal to the product of the probabilities ($\simeq$ +\SI{17}{\percent}). Added to the probability distribution, each word in the +vocabulary guarantees that its sum is unitary. Finally, through HSM, Word2Vec +can retrieve word input vectors where the probability distribution of two +synonymous words will have a cosine similarity close to one. + +\subsection{Negative Sampling} +\label{subsec:w2v:ns} + +Negative sampling is an alternative technique to HSM, which also reduces the +training speed but combined with Subsampling Frequent Words, it can improve the +word embeddings' quality. To achieve this, \textsc{Mikolov} et al. assume that +updating all the neurons of a ANN for each training sampling is computationally +expensive. From then on, negative sampling focuses on making each training +sample change only a tiny percentage of the weights rather than all of +them~\citep{mccormick}. However, with or without negative sampling, the hidden +layer only updates the input word weights. 
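To see why each training sample only changes a tiny percentage of the weights,
the following sketch performs one SGD step for a single (target, context) pair
with a handful of negative samples: only one row of the input matrix and $k + 1$
rows of the output matrix are touched. It is a minimal sketch of the idea,
assuming the embedding matrices are plain NumPy arrays, and is not the actual
Word2Vec code.

\begin{verbatim}
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def negative_sampling_step(W_in, W_out, target, context, negatives,
                           lr=0.025):
    """One SGD step for a single pair plus k negative samples.

    Only W_in[target] and the k + 1 rows W_out[[context] + negatives]
    are updated, instead of the whole vocabulary.
    """
    v = W_in[target]
    grad_in = np.zeros_like(v)
    for word, label in [(context, 1.0)] + [(n, 0.0) for n in negatives]:
        u = W_out[word]
        error = sigmoid(np.dot(v, u)) - label   # prediction error
        grad_in += error * u
        W_out[word] -= lr * error * v           # update one output row
    W_in[target] -= lr * grad_in                # update one input row

# Toy usage: vocabulary of 10 words, 4-dimensional embeddings.
rng = np.random.default_rng(2)
W_in, W_out = rng.normal(size=(10, 4)), rng.normal(size=(10, 4))
negative_sampling_step(W_in, W_out, target=0, context=3, negatives=[5, 7, 9])
\end{verbatim}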
+ +The way negative sampling works can be considered a simplified version of the +Noise Contrastive Estimation (NCE) metric, whose purpose of NCE is to learn a +data distribution by comparing it against a noise distribution. In the context +of negative sampling, the latter differentiates a target word from noise samples +using a logistic regression classifier~\citep{DBLP:journals/jmlr/GutmannH10}. + +The words being stored in one-hot vectors, this technique updates the weights of +the ``positive'' and ``negative'' words. A word is considered positive if it was +initially in the training sample, while a word is negative if it is considered +noise. Specifically, this noise results from the return of the zero value in +these one-hot vectors by the ANN. A small number makes selecting these negative +words of random words using a ``unigram distribution'', where the most frequent +words are more likely to be chosen as negative samples~\citep{mccormick}. + +Let $w_i$ be a word from a corpus, $f_{w_i}$ be a word frequency, and $w_j$ be a +total number of negative words in the corpus. The probability that a $w_i$ word +is selected as a negative word is defined as follows: +\begin{equation} P(w_i) = \frac{f(w_i)}{\sum_{j=0}^nf(w_j)} + \label{eq:w2v:ns:selection:intro} +\end{equation} + +However, the published version increases the number of words to 3/4 power to +provide better results. The following formula also increases the probability of +infrequent words and decreases the likelihood of frequent words: +\begin{equation} P(w_i) = \frac{f(w_i)^{3/4}}{\sum_{j=0}^nf(w_j)^{3/4}} + \label{eq:w2v:ns:selection} +\end{equation} + +In practice, Word2Vec implements this negative sampling selection by filling a +table called \emph{unigram table}, including the index of each word present in a +vocabulary~\citep{mccormick}. Then, the selection of a negative sample is made +by generating a random index, where the index of a word appears $P(w_i) * +{table}_{\text{size}}$ times in a table. Therefore, the more frequent words are +more likely to be negative words. Finally, for the choice of the number of +negative words, it is recommended to take five to twenty words for small data +sets instead of large data sets where it is preferable to take two to five +words.~\citep{inproceedings:mikolov}. + +\subsection{Advantages and Disadvantages} +\label{subsec:w2v:pro:cons} + +In a non-exhaustive way, the advantages of the Word2Vec are: +\begin{itemize} +\item The use of unsupervised learning and can therefore work on any plain text. +\item Requires less RAM compared to other words/vectors representations. +\end{itemize} + +As for its main disadvantages, they are the following: +\begin{multicols}{2} +\begin{itemize} +\item No embeddings are available for Out of Vocabulary (OOV) words. Therefore, +if Word2Vec has trained with the ``cat'' and ``fish'' words, it will not generate +the embedding of the ``catfish'' compound +\item No suffixes/prefixes meaning capture for given words in a corpus. +\columnbreak +\item Generation of low quality word embeddings for rare words. +\item Difficulty in determining the best value for the dimensionality of the + word vectors and the window size. +\item The use of the softmax activation function is computationally expensive. +\end{itemize} +\end{multicols} + +For the choice of the model, it is recommended to use SG when it is essential to +predict infrequent words with a small amount of training +data~\citep{inproceedings:mikolov}. 
However, CBOW is preferable for the +prediction of frequent words. Added to that, SG performs significantly on +semantic accuracy tests (e.g., ``Athens'' $\rightarrow$ ``Greece'' and ``Oslo'' +$\rightarrow$ ``Norway''). At the same time, CBOW offers better results in +syntactic accuracy tests (e.g., ``apparent'' $\rightarrow$ ``apparently'' and +``rapid'' $\rightarrow$ ``rapidly'')~\citep{sijun_he}. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../../report" +%%% End: diff --git a/src/flyleaf.tex b/src/flyleaf.tex new file mode 100755 index 0000000..8fbbe63 --- /dev/null +++ b/src/flyleaf.tex @@ -0,0 +1,82 @@ +\begin{titlepage} + \begin{center} + + \textsc{\Large Master's Thesis at IDLab, Ghent University - imec}\\[1cm] + + \includegraphics[width=.5\linewidth]{img/logo/heh-technical}\\[1cm] + + \begin{vc} + \begingroup + \HRule\\ + \flushleft + \Huge A Trip to Sesame Street: Evaluation of BERT and Other Recent Embedding Techniques Within RDF2Vec + \HRule\\ + \endgroup + \end{vc} + + \begin{minipage}[t]{0.4\textwidth} + \begin{flushleft} \large + \emph{Author:}\\ + \hspace{0.1em} + Terencio \textsc{Agozzino} + \end{flushleft} + \end{minipage} + \hspace{5em} + \begin{minipage}[t]{0.4\textwidth} + \begin{flushleft} \large + \emph{Advisor:}\\ + \hspace{0.1em} + Prof. Dr. Femke \textsc{Ongenae} + \end{flushleft} + \begin{flushleft} \large + \emph{Supervisors:}\\ + \hspace{0.1em} + Dr. Ir. Gilles \textsc{Vandewiele}\\ + \hspace{0.1em} + Dr. Ir. Samuel \textsc{Cremer}\\ + \hspace{0.1em} + Ir. Bram \textsc{Steenwinckel} + \end{flushleft} + \end{minipage}\\[1cm] + + \parbox{100mm}{\textit{Master's thesis submitted in fulfillment of the + requirements for the degree of Master in Industrial Engineering}}\\[1cm] + + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.6\linewidth]{img/logo/imec} + \end{minipage} + \hfill + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.6\linewidth]{img/logo/ugent} + \end{minipage} + \hfill + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.6\linewidth]{img/logo/idlab} + \end{minipage}\\[1.5cm] + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.5\linewidth]{img/logo/wbe_vertical} + \end{minipage} + \hfill + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.5\linewidth]{img/logo/pole_hainuyer_horizontal} + \end{minipage} + \hfill + \begin{minipage}[c]{0.3\textwidth} + \centering + \includegraphics[width=.4\linewidth]{img/logo/cti} + \end{minipage}\\[1cm] + + \large Academic year 2020-2021 + \vfill + \end{center} +\end{titlepage} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: diff --git a/src/introduction.tex b/src/introduction.tex new file mode 100644 index 0000000..a18cd9d --- /dev/null +++ b/src/introduction.tex @@ -0,0 +1,122 @@ + +\chapter{Introduction} +\label{chap:introduction} + +In an increasingly digital world, data generation applies different semantic and +syntactic origins~\citep{DBLP:journals/jodsn/CeravoloAACCDMK18}. However, this +data diversity must remain interpretable by the computer. In the absence of +semantics, the evaluation of the validity of the data is more +constraining. Without it, the same data point might not represent the same +thing. For example, the value of a sensor may be correct in one context but an +anomaly in another. Therefore, semantics makes it possible to make the context +of these data precise and understand their relationships. 
+ +The usage of the Resource Description Framework (RDF) standard of the World Wide +Web Consortium (W3C) enables the semantic encoding of data. Such a standard +allows the management of this diversity of data through the Semantic Web and +Linked Open Data by interconnecting the different data sources. Disregarding the +\emph{knowledge base}, which contains semantic information, one way to make this +interconnection is to use graphs. A \emph{graph} is an ordered pair ($V, E$), +where $V$ is a finite and non-empty set of elements called \emph{vertices} (or +\emph{nodes}), and $E$ is a set of unordered pairs of distinct nodes of $V$, +called \emph{edges}. + +Based on this alternative representation of RDF data, the concept of Knowledge +Graph (KG) was published~\citep{singhal_2012}. Mathematically, a KG is a +\emph{directed heterogeneous multigraph}. This graph can store multiple directed +labeled edges between two nodes whose edges and nodes can be of different +types. Additionally, a KG can unite various sources and enhance conventional +data formats such as Comma Separated Values (CSV) by explicitly encoding the +relations between various nodes. Due to the richness of relations that these +types of graphs bring, several Machine Learning (ML) techniques can benefit from +them. From then on, there is a restricted usage of KGs due to their symbolic +constructs as most ML techniques require converting these KGs into numerical +input feature vectors. + +During this decade, different techniques +emerged~\citep{inproceedings:ristoski:strategies} to create these numerical +feature vectors, called \emph{embeddings}. One of them is to use Resource +Description Framework To Vector (RDF2Vec), an unsupervised and task-agnostic +algorithm to numerically represent nodes of a +KG~\citep{article:ristoski:rdf2vec} in an \emph{embedding matrix} used for +downstream ML tasks. For this purpose, RDF2Vec uses a walking strategy to +extract \emph{walks} of a KG, where a walk is an $n$-tuple of nodes starting +with a root node. In addition to this strategy, it is possible to use a sampling +strategy to better deal with larger KGs~\citep{inproceedings:cochez} by +privileging some hops over others through edge weights. Once extracted, these +walks are injected into an \emph{embedding technique} to learn the embeddings of +the provided root nodes of a KG and generate the corresponding embedding matrix. + +Following the significant advances in Natural Language Processing (NLP), the +community developed multiple embedding techniques. Among them, the release of +Word2Vec in 2013 initially learns the vector representation of a +word~\citep{inproceedings:mikolov}. However, KGs used this NLP technique where +walks can be an analogy of sentences and nodes as words. Word2Vec is the default +embedding technique of RDF2Vec and has shown promising +results~\citep{DBLP:journals/semweb/RistoskiRNLP19} in its use with +KGs. However, other significant advances in the NLP field have taken place. + +One of these advances was the creation of FastText, a Word2Vec extension to +improve the obtained embeddings. Published in 2016, FastText has improved these +embeddings in many use-cases at the expense of Random Access Memory (RAM) +consumption and training time, but more recent techniques have emerged. Since +2018, Bidirectional Encoder Representations from Transformers (BERT) is the +state-of-the-art NLP technique~\citep{inproceedings:devlin} whose objective is +to generate an unsupervised language representation model. 
This new technique +outperformed Word2Vec in everyday NLP +tasks~\citep{DBLP:conf/embc/SahaLG20,article:beseiso,DBLP:conf/acl/HendrycksLWDKS20}. Due +to its efficiency, the community published many variants of BERT, each bringing +its specificity. However, BERT's benefits with KGs are still debatable. Few +research papers have used this technique in KGs to compare it with other +embedding techniques. + +As a result of this finding, the main purpose of this Master's thesis is to +provide a research work to focus on evaluating BERT with KGs based on Word2Vec +and FastText. Such an evaluation would help determine its impact on the created +embeddings and improve the generation of a 2D feature matrix from a KG. + +To achieve this evaluation, an implementation of BERT and FastText will have to +be made to the \texttt{pyRDF2Vec} library, a central Python implementation of +the Java-based version of the RDF2Vec algorithm~\citep{pyrdf2vec}. In addition, +since BERT works differently from Word2Vec, and FastText, it will be necessary +to adapt the extraction and formatting of the walks of a KG to improve its +training. Finally, the choice of hyperparameters will be a determining criterion +on the accuracy of the BERT model, so it may be wise to find a compromise +between training time and the model's accuracy. + +Besides this primary objective, this Master's thesis proposes one more walking +strategy and sampling strategy to those already provided by +\texttt{pyRDF2Vec}~\citep{inproceedings:cochez}. Specifically, a walking +strategy called \texttt{SplitWalker} uses a splitting function to pre-format the +extracted walks before being injected into an embedding technique. In addition +to this strategy, the \texttt{WideSampler} sampling strategy focuses on features +shared between several entities by maximizing common relationships. Such +additions could improve a model's accuracy and the extraction time for specific +use cases according to a user's needs. + +The result of this Master's thesis work is structured as follows. Chapter +\ref{chap:related:work} introduces more context on the work related to BERT and +RDF2Vec with KGs. Chapter \ref{chap:objectives} focuses on providing the +specifications for this Master's thesis and stating the problems +encountered. Chapter \ref{chap:background} provides background information on +the fundamental concepts of graphs, ML, and NLP. This chapter also introduces +advanced concepts such as the Attention mechanism and the Transformer +architecture, both used by BERT. In addition, Chapter +\ref{chap:embedding:techniques} focuses on covering in detail the functioning of +each embedding technique, dedicating a section to the adaptation of BERT with a +graph. From then on, Chapter \ref{chap:rdf2vec} refers to RDF2Vec, including the +walking strategies and the sampling strategies proposed by this Master's +thesis. Chapter \ref{chap:work:performed} describes the work done behind this +Master's thesis, explaining the different implementations that were put in +place. Chapter \ref{chap:benchmarks} focuses on providing the setup and results +obtained from the different benchmarks related to the embedding techniques, +walking and sampling strategies. Following these results, Chapter +\ref{chap:discussion} discusses the correlation of the results with those of +other research papers and provides some leads for future research with +BERT. 
Finally, Chapter \ref{chap:conclusion} is dedicated to the conclusion of +this Master's thesis, summarizing the issues and solutions proposed. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: diff --git a/src/objectives.tex b/src/objectives.tex new file mode 100644 index 0000000..c8fbb80 --- /dev/null +++ b/src/objectives.tex @@ -0,0 +1,118 @@ + +\chapter{Objectives} +\label{chap:objectives} + +This chapter covers the specifications expected by this Master's thesis and the +problems faced in its elaboration. In addition, each specification is followed +by a summary of what was done to get a better idea of the overall report. + +\section{Specifications} +\label{sec:objectives:specifications} + +In 2018, BERT was released and outperformed all other existing techniques on +various ML tasks related to various fields. This Master's thesis aims to +integrate BERT into +\texttt{pyRDF2Vec}\footnote{\url{https://github.com/IBCNServices/pyRDF2Vec/}}, +but more specifically to evaluate its impact in terms of runtime, predictive +performance, and memory usage using different combinations of walking and +sampling strategies. Moreover, by reading literature from general graph-based ML +and NLP research, inspiration can be found to design new sampling and walking +strategies that result in improved predictive performances by exploiting the +properties of BERT. The specifications of this Master's thesis are composed of +the following four points: +\begin{enumerate} +\item \textbf{Support BERT and other embedding techniques for comparison +purposes}: at least the BERT embedding technique should be integrated into +\texttt{pyRDF2Vec} and be compared to the Word2Vec technique, which is currently +used. Additionally, the extension of Word2Vec, namely FastText, will also have +to be implemented and be compared to know its impact. + +For this Master's thesis, BERT and FastText have been implemented and integrated +within the \texttt{pyRDF2Vec} library. Added to this implementation is a new +architecture for \texttt{pyRDF2Vec} that easily allows other embedding +techniques. Since Word2Vec already exists in the library, this Master thesis +proposed a better walk extraction to improve the model's accuracy. + +\item \textbf{Evaluate the impact of BERT}: A thorough benchmarking study to +evaluate the impact of the BERT embedding technique on different dimensions will +have to be conducted. For this, data sets having different properties and +stemming from different domains will have to be selected. Moreover, a rigorous +framework that performs the required experiments and logs all of these results +will have to be developed. + +For this Master's thesis, BERT, Word2Vec, and FastText are compared on +\texttt{MUTAG}, a small KG. In addition to these benchmarks of embedding +techniques, the \texttt{pyRDF2Vec} library was modernized, allowing to get more +information with the \texttt{verbose} parameter of the +\texttt{RDF2VecTransformer} class. + +\newpage + +\item \textbf{Support of new walking strategies for RDF2Vec}: while an initial +set of five simple walking strategies are already implemented in +\texttt{pyRDF2Vec}, many other alternatives exist. Implementing more walking +strategies would allow more extensive and detailed comparisons for embedding +techniques. + +For this Master's thesis, the \texttt{SplitWalker} walking strategy is proposed and +implicitly introduced some of its variants. 
Finally, this Master's thesis offers +some benchmarks related to \texttt{SplitWalker} and other existing walking strategies. + +\item \textbf{Support of new sampling strategies for RDF2Vec}: similarly, other +sampling strategies can be created and implemented in +\texttt{pyRDF2Vec}. Therefore, it will be possible to see the impact of the +choice of a walking strategy and a sampling strategy with the performances +related to BERT and other embeddings techniques. + +For this Master's thesis, the \texttt{WideSampler} walking strategy is proposed, +leading to other variants of the latter. In addition, this Master's thesis +evaluates \texttt{WideSampler} with other existing sampling strategies to knows +its impact. +\end{enumerate} + +\section{Problems Encountered} +\label{sec:objectives:problems} + +This section discusses the problems that have been encountered and why some +benchmarks are missing. Initially, benchmarks related to bigger KGs such as +\texttt{AM} and \texttt{DBP:Cities} should have been done. Therefore, it is +helpful to understand the reasons for this to prevent these errors from +recurring. + +\subsection{Garbage Collector} +\label{subsec:garbage:collector} + +This Master's thesis being coupled with the internship, the same servers were +used. Due to an internal problem with IDLab's server infrastructure, a loss of +time equivalent to one and a half months of work was made. Therefore, this loss +of time had repercussions on this Master's thesis. Specifically, the benchmarks +related to BERT are not presented for this version of the report. However, they +will be added for the next version. Moreover, some benchmarks had to be +shortened due to time constraints. + +For more information related to this issue, IDLab servers use the +Stardog\footnote{\url{https://www.stardog.com/}} infrastructure, which is +implemented in Java. Therefore, the different benchmarks for the internship and +Master's thesis recorded high variants. After several weeks of debugging, +thinking that this was a \texttt{pyRDF2Vec} issue, a Stardog engineer confirmed +that the issue was related to their infrastructure. In more detail, the concern +was the lack of RAM release from the servers due to garbage collection. As a +result, new data was being saved directly to a physical disk, which resulted in +much higher latencies than RAM. From then on, this was reflected in the variance +of the results. + +\subsection{Exceptions Raised} +\label{subsec:exception:raised} + +In addition to the problems related to the garbage collector, exceptions are +encountered from time to time during SPARQL Protocol and RDF Query Language +(SPARQL) queries. These exceptions are due to the inaccessible locally hosted +SPARQL endpoint during updates or internal network problems. As a result, after +more than three attempts to retrieve the walks of a provided entity, an +exception is thrown, which makes the benchmarks stop. Since these benchmarks can +take several days, it also delayed other benchmarks. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "latex" +%%% End: diff --git a/src/rdf2vec.tex b/src/rdf2vec.tex new file mode 100644 index 0000000..c2c99da --- /dev/null +++ b/src/rdf2vec.tex @@ -0,0 +1,225 @@ + +\chapter{RDF2Vec} +\label{chap:rdf2vec} + +Resource Description Framework To Vector (RDF2Vec) is an unsupervised and +task-agnostic algorithm to numerically represent nodes of a KG in an embedding +matrix that downstream ML tasks can use. 
RDF2Vec is unsupervised as it only +relies on the neighborhood of an entity to create embeddings and therefore does +not require any information about the node labels. Precisely, the dimensions +(e.g., $K \times 500$) of this embedding matrix result from the walk extraction +(e.g., $K$) and vector size (e.g., 500). + +\section{Walk Extraction} +\label{sec:rdf2vec:walk:extraction} + +This algorithm starts using a \emph{walking strategy} on a provided KG to +extract these walks, collecting an $n$-tuple of nodes starting with a root node +following a sequence of predicates and objects. Using the similarities shared by +natural language and graphs, these walks then serve as sentences for existing +NLP techniques, such as Word2Vec, to learn the embeddings of the root nodes of +this KG provided by a user. + +\begin{figure}[!ht] + \centering + \subfloat[Step I]{ + \begin{tikzpicture}[minimum size=2cm,node distance=1cm] + \node[entity,fill=myblue,font=\large,scale=0.6] (john) {\textsc{John}}; + \node[entity,font=\large,left=of john,scale=0.6] (friendOf){friendOf}; + \node[entity,font=\large,left=of friendOf,scale=0.6] (smith) {\textsc{Smith}}; + \node[entity,font=\large,right=of john,scale=0.6] (hasCat) {hasCat}; + \node[entity,font=\large,right=of hasCat,scale=0.6] (felix) {\textsc{Felix}}; + + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{2}} (hasCat); + \draw[arrow] (hasCat) -- node[midway,yshift=0.4cm] {\small{1}} (felix); + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{1}} (friendOf); + \draw[arrow] (friendOf) -- node[midway,yshift=0.4cm] {\small{1}} (smith); + \end{tikzpicture} + } + \vfill + \subfloat[Step II]{ + \begin{tikzpicture}[minimum size=2cm,node distance=1cm] + \node[entity,font=\large,scale=0.6] (john) {\textsc{John}}; + \node[entity,font=\large,left=of john,scale=0.6] (friendOf){friendOf}; + \node[entity,font=\large,left=of friendOf,scale=0.6] (smith) {\textsc{Smith}}; + \node[entity,fill=myblue,font=\large,right=of john,scale=0.6] (hasCat) {hasCat}; + \node[entity,font=\large,right=of hasCat,scale=0.6] (felix) {\textsc{Felix}}; + + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{2}} (hasCat); + \draw[arrow] (hasCat) -- node[midway,yshift=0.4cm] {\small{1}} (felix); + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{1}} (friendOf); + \draw[arrow] (friendOf) -- node[midway,yshift=0.4cm] {\small{1}} (smith); + \end{tikzpicture} + } + \vfill + \subfloat[Step III]{ + \begin{tikzpicture}[minimum size=2cm,node distance=1cm] + \node[entity,font=\large,scale=0.6] (john) {\textsc{John}}; + \node[entity,font=\large,left=of john,scale=0.6] (friendOf){friendOf}; + \node[entity,font=\large,left=of friendOf,scale=0.6] (smith) {\textsc{Smith}}; + \node[entity,font=\large,right=of john,scale=0.6] (hasCat) {hasCat}; + \node[entity,fill=myblue,font=\large,right=of hasCat,scale=0.6] (felix) {\textsc{Felix}}; + + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{2}} (hasCat); + \draw[arrow] (hasCat) -- node[midway,yshift=0.4cm] {\small{1}} (felix); + \draw[arrow] (john) -- node[midway,yshift=0.4cm] {\small{1}} (friendOf); + \draw[arrow] (friendOf) -- node[midway,yshift=0.4cm] {\small{1}} (smith); + \end{tikzpicture} + } + \caption{Walk Extraction for an Oriented Graph.} + \label{fig:rdf2vec:walk:extraction} +\end{figure} + +In Figure \ref{fig:rdf2vec:walk:extraction}, a KG is composed of five nodes and +four edges. 
Each edge is associated with a weight determined by a \emph{sampling
+strategy}, which can assign these weights either randomly or guided by a
+particular metric, in which case the walks are called \emph{biased walks}. These
+edge weights help the walking strategy identify the next neighboring entity to
+extract in a walk. The walk extraction starts with \texttt{John} as the root node,
+including \texttt{friendOf} and \texttt{hasCat} as possible candidates for the
+next hop. However, as the \texttt{hasCat} edge has a higher weight than the
+\texttt{friendOf} edge, the hop to \texttt{hasCat} is preferred. After this hop,
+the walking strategy updates its list of candidates with the neighbors of the
+current node. Finally, the process continues to iterate until this walking
+strategy returns an exhaustive list of walks or reaches a predefined depth,
+which bounds the number of successive hops within a walk. From then on, the
+3-tuple walk (\texttt{John}, \texttt{hasCat}, \texttt{Felix}) is extracted.
+
+The original RDF2Vec implementation uses a random walking strategy to extract a
+limited number of walks. The particularity of this walking strategy is that it
+extracts a random hop\footnote{Originally RDF2Vec preferred a random walk.}
+when neighboring hops have the same weight. The random walking strategy applies
+two algorithms to extract walks: Depth-First Search (DFS) and Breadth-First
+Search (BFS). The former traverses a graph as deep as possible before retracing
+its steps. In contrast, the latter visits every neighboring node of the same
+depth before crossing those of a different depth. Therefore, if a specific
+number of walks has to be extracted, this strategy uses DFS. Otherwise, the
+random walking strategy picks BFS to extract every walk of a KG.
+
+\subsection{Walking Strategies}
+\label{subsec:rdf2vec::walking:strategies}
+
+After the publication of RDF2Vec, several walking strategies became
+available~\citep{inproceedings:cochez}, where every walking strategy is either an
+extraction (\texttt{type 1}) or a transformation (\texttt{type 2})
+technique~\citep{article:vandewiele}. Each of these strategies has its own
+particularities, making it preferable in at least one use case.
+
+As the name implies, extraction techniques focus on extracting walks, usually so
+that these walks provide richer information to produce a model resulting in the
+highest possible accuracy. This category includes the random walking strategy and
+\emph{Community Hops}. Based on relationships not explicitly modeled in a KG,
+the latter groups nodes with similar properties through community detection.
+However, both walking strategies rely on the BFS and DFS algorithms, with or
+without some variations.
+
+The transformation techniques categorize the walking strategies that transform
+the extracted walks provided by a \texttt{type 1} walking strategy. As they are
+easier to implement, this type includes more walking strategies than
+\texttt{type 1}. The primary purpose of these techniques is to define a
+\emph{one-to-many} or \emph{many-to-one} cardinality between the old node
+labels and the new ones. If there is a \texttt{one-to-one} cardinality, no
+additional information is gained and the original walking strategy could be
+used.
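+
+To make the distinction between both types more concrete, the following sketch
+first extracts random walks from a toy graph (\texttt{type 1}) and then
+anonymizes them (\texttt{type 2}). It is only an illustration with hypothetical
+names and data structures, not the \texttt{pyRDF2Vec} implementation.
+
+\begin{lstlisting}[caption=Toy Sketch of a Walk Extraction Followed by a Transformation.]
+import random
+
+# Toy KG as an adjacency list: node -> list of (predicate, object) hops.
+TOY_KG = {
+    "John": [("friendOf", "Smith"), ("hasCat", "Felix")],
+    "Smith": [],
+    "Felix": [],
+}
+
+def extract_random_walks(kg, root, depth, n_walks):
+    # Type 1: extract random walks of at most `depth` hops from `root`.
+    walks = set()
+    for _ in range(n_walks):
+        walk, current = [root], root
+        for _ in range(depth):
+            hops = kg.get(current, [])
+            if not hops:
+                break
+            predicate, obj = random.choice(hops)
+            walk += [predicate, obj]
+            current = obj
+        walks.add(tuple(walk))
+    return walks
+
+def anonymize(walks):
+    # Type 2: relabel every vertex except the root by its position.
+    return {
+        (walk[0],) + tuple(str(i) for i in range(1, len(walk)))
+        for walk in walks
+    }
+
+walks = extract_random_walks(TOY_KG, "John", depth=1, n_walks=5)
+print(anonymize(walks))  # e.g., {('John', '1', '2')}
+\end{lstlisting}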
+ +In a non-exhaustive way, the following walking strategies of \texttt{type 2} +rely on the transformation of randomly extracted walks: +\begin{itemize} +\item \textbf{Anonymous Walk}: transforms each vertex name other than the root +node into positional information to anonymize the randomly extracted walks. + +\item \textbf{Hierarchical Random Walks} (\textbf{HALK}): removes rare hops from randomly +extracted walks, increasing the quality of the generated embeddings while +reducing memory usage. + +With this strategy, the suppression of a walk occurs when this walk only +contains the root node following one or more infrequent hop(s), as it will not +provide additional information. + +\item \textbf{N-Gram}: transforms the $n$-grams in random walks to define a +mapping from \emph{one-to-many}. The intuition behind this strategy is that the +predecessors of a node that two different walks have in common can be different. + +\item \textbf{Walklets}: transforms randomly extracted walks into walklets which +are walks of size one or two, including the root node and potentially another +node that can be a predicate or an object. + +\item \textbf{Weisfeiler-Lehman}: transforms the nodes of the extracted random +walks, providing additional information about the entity representations only +when a maximum number of walks is not specified. +\end{itemize} + +However, the implementations of these sampling strategies in \texttt{pyRDF2Vec} +relied on extracting child nodes, not parent nodes, which lost context for a +root node. + +\subsection{Sampling Strategies} +\label{subsec:rdf2vec:sampling:strategies} + +The sampling strategies essentially allow to better deal with larger KGs. A +naive implementation randomly samples a fixed number of walks for each entity to +keep the total number of walks limited. Since then, the +community~\citep{inproceedings:cochez,article:mukherjee,article:taweel} has +suggested several metrics to compute the sampling weights while walking. + +Although sampling strategies allow the extraction of large KGs, most of these +strategies require working on the entire KG to assign weights to edges. However, +some KGs are so large that they need to be stored in a +\emph{triplestore}\footnote{Database designed for the storage and retrieval of +RDF.} and made available through a SPARQL endpoint. Therefore, one way to use +these sampling strategies is to load parts of large KGs, assign weights, and +start the walk extraction. Finally, the different walks extracted for these +parts of KGs would be concatenated and returned. + +\section{Shortcomings} +\label{sec:rdf2vec:shortcomings} + +The node representation made by RDF2Vec has already achieved great predictive +performances on several data sets in various fields. However, RDF2Vec still has +a few shortcomings. In a non-exhaustive way: +\begin{itemize} +\item \textbf{RDF2Vec does not scale to large KGs}: the walk extraction grows +exponentially with the predefined depth. This behavior is unacceptable with KGs +containing many nodes, mainly when these KGs contain many highly connected +nodes. + +\item \textbf{RDF2Vec cannot deal with literals in the KG}: this algorithm +extracts node as \emph{non-ordinal categorical} data, which discard a +considerable amount of rich information that \emph{literals} can provide. In +other words, nodes can be from different types, but RDF2Vec extracts these nodes +as a name without any classification instead of conserving their types. 
+ +\item \textbf{RDF2Vec cannot deal with dynamic graphs}: adding a new entity in a +KG implies re-training the model generated by the embedding technique. This +re-training is undesirable, especially when the training time of a model is +substantial. + +\item \textbf{RDF2Vec uses a simple data structure for storing walks}: +Extracting more complex data structures, such as trees, or modifying the walking +algorithm to introduce different inductive \emph{biases} could result in higher +quality embeddings. Consequently, these quality embeddings would improve the +model's accuracy. + +\item \textbf{RDF2Vec uses an embedding technique that is no longer state of the +art in NLP}: currently, RDF2Vec uses Word2Vec as an embedding +technique. However, more recent NLP techniques such as BERT could be a better +alternative to Word2Vec and improve the model's accuracy. +\end{itemize} + +The community has proposed solutions to address the shortcomings mentioned above +to improve this representation of nodes. Among these, optimization mechanisms +(e.g., caching and multiprocessing) to better handle large KGs and an +\emph{online learning} implementation to update the vocabulary of nodes learned +by RDF2Vec. In addition, a user could extract interesting literals by specifying +a sequence of predicates followed by a walking strategy. Finally, other +embedding techniques than Word2Vec were also proposed, such as +\emph{KGloVe}\footnote{\url{https://datalab.rwth-aachen.de/embedding/KGloVe/}}, +which uses the Global Vectors for Word Representation (GloVe) embedding +technique. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "latex" +%%% End: diff --git a/src/references.tex b/src/references.tex new file mode 100644 index 0000000..97bcec9 --- /dev/null +++ b/src/references.tex @@ -0,0 +1,7 @@ + +\bibliography{bibli} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "../report" +%%% End: diff --git a/src/related-work.tex b/src/related-work.tex new file mode 100644 index 0000000..3d0c652 --- /dev/null +++ b/src/related-work.tex @@ -0,0 +1,122 @@ + +\chapter{Related Work} +\label{chap:related:work} + +This chapter presents the related work around BERT and RDF2Vec with KGs to +understand better the study done by this Master's thesis. + +\section{BERT} +\label{section:related:work:bert} + +Over the last few years after the publication of +BERT~\citep{inproceedings:devlin}, several variants of this model emerged and +were evaluated with its original version. One of these variants is DistilBERT, a +smaller, cheaper, and lighter version of BERT. The comparison of DistilBERT with +BERT and ELMo, another embedding technique, showed promising results across +several data sets. Specifically, DistilBERT~\citep{distilbert} had almost as +excellent performance as BERT surpassing ELMo's score in most data sets. + +In the NLP related to biomedical and clinical texts, a benchmark of five tasks +with ten datasets of different sizes and difficulties also compared BERT and +ELMo~\citep{peng}. In this benchmark, BERT showed better overall results +compared to ELMo. Another evaluation provides lower accuracy for BERT than GloVe +when using a dataset related to tweets with or without +sarcasm~\citep{khatri}. According to the authors, these lower BERT scores are +due to the lack of context provided by the sarcastic tweets. Finally, another +benchmark is interested in comparing BERT with ELMo, GloVe, and FastText by +performing principal component static embeddings~\citep{ethayarajh}. 
In this use +case, BERT shows the best results in most data sets. + +Meanwhile, the community has released other KG-oriented variants of BERT, such +as KG-BERT, released in 2019. This BERT-based framework achieves +state-of-the-art performance in triple classification and link and relationship +prediction tasks~~\citep{DBLP:journals/corr/abs-1909-03193}. Shortly after the +publication of KG-BERT, K-BERT is also released, which has the particularity of +not using pre-training. K-BERT uses BERT to infer relevant knowledge in a domain +text to solve the lack of domain-specific knowledge within a general language +representation from large-scale corpora~\citep{DBLP:conf/aaai/LiuZ0WJD020}. In +this paper, K-BERT significantly outperforms BERT in NLP tasks from different +fields such as finance, law, and medicine. + +After the release of KG-BERT and K-BERT, others variants followed, such as +Graph-BERT published in 2020. Graph-BERT relies solely on the Attention +mechanism without using graph convolution or aggregation operations. This +KG-variant also shows promising results overcoming Graph Neural Networks (GNNs) +in the learning efficiency for node classification and graph clustering +tasks~\citep{zhang}. + +BERT-INT is also another variation of BERT, which predicts the identity of two +entities across multiple KGs. BERT-INT works on proximity information to predict +such identity without relying on the KG structure +itself~\citep{DBLP:conf/ijcai/Tang0C00L20}. Precisely, BERT-INT mimics entity +comparison as humans would by comparing their name, description, and +attributes. When these two entities share the same information, a second +comparison consists of assessing the similarity of neighbors between entities, +but this time based on their name and description only. + +Finally, the last paper uses BERT with Transfer Learning to answer questions +based on a domain-specific KG~\citep{DBLP:conf/aics/VegupattiNC20}. Specifically +in the field of medical biology using a dataset of 600 questions. In this paper, +BERT succeeded in answering these questions with more than acceptable results. + +The discoveries made around BERT suggest that the usage of this embedding +technique with KGs is possible. However, too few research papers compare BERT +with other embedding techniques. This lack of articles makes it unclear what the +context of BERT's application is. The work of \textsc{Khatri} et al. gives a +lead, but it is not enough. + +\section{Graph-Based Machine Learning} +\label{section:related:work:graph} + +From a data modeling perspective, the use of KG since its publication has become +a standard in the Semantic Web. This specific type of graph has its use in many +fields to model a knowledge domain. However, despite the valuable information +such a graph can provide, an ML model cannot directly learn from +them~\citep{article:ristoski:rdf2vec}. Therefore, this decade proposed several +techniques to convert KGs into embeddings for downstream ML tasks. In the field +of KG embeddings, three categories of embedding creations are distinguished: +\emph{direct encoding} based, Deep Learning (DL) based, and \emph{path/walk} +based. + +The first category includes algorithms such as RESCAL and Translating Embeddings +(TransE) that mainly stand out in tasks related to graph completion as well as +link prediction and entity classification, also called \emph{node +classification}~\citep{DBLP:conf/nips/BordesUGWY13}. 
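+As an illustration of such direct encodings, TransE represents a triple
+$(h, r, t)$ as a translation in the embedding space; a common formulation of its
+scoring function is
+\[
+  f(h, r, t) = \| \mathbf{h} + \mathbf{r} - \mathbf{t} \|,
+\]
+where $\mathbf{h}$, $\mathbf{r}$, and $\mathbf{t}$ denote the embeddings of the
+head entity, the relation, and the tail entity, and a small distance indicates a
+plausible triple.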
It is still possible to +perform mathematical operations with direct encoding while remaining the +\emph{embedded space} node classification result. This embedded space ensures +the data embedding after dimensionality reduction. However, two of the drawbacks +of direct encoding are their limited use with dynamic graphs and support for +literals. + +Modeling Relational Data with Graph Convolutional Networks (R-GCNs) mainly +dominates the second category. R-GCNs is a supervised algorithm published in +2017 working directly on graphs. This algorithm illustrates DL's power in the +Statistical Relational Learning tasks as link prediction and entity +classification~\citep{DBLP:conf/esws/SchlichtkrullKB18}. However, a significant +disadvantage of R-GCNs is their memory consumption due to loading the KG +upstream, limiting its use to KGs of reasonable size. Another drawback concerns +their dependency on manual labeling of the training datasets due to the +\emph{supervised learning}, which can be time-consuming depending on its +size. Besides that, the support of literals is theoretically possible, but only +a few studies have investigated the subject. + +The last category includes RDF2Vec, which is the state-of-the-art unsupervised +algorithm since 2016. Unlike the other two categories, in this category, the +creation of embeddings is done by traversing a KG using a walking and sampling +strategy. Therefore, an essential drawback of RDF2Vec is that without caching +and other optimization mechanisms, the walk extraction time can be significant +for large KGs. + +Each of these three categories of algorithms ensures the generation of +embeddings. However, according to the use case, one category may be preferred to +another. For example, since R-GCNs explicitly remove edges in a KG before +training a model, RDF2Vec could be preferred. Furthermore, RDF2Vec can decide +whether or not to prioritize a hop due to its walking and sampling strategy, +which avoids training a model over the whole KG. Finally, these categories can +support an online learning implementation and guarantee that a model remains up +to date. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "latex" +%%% End: diff --git a/src/work-performed.tex b/src/work-performed.tex new file mode 100644 index 0000000..1eb2547 --- /dev/null +++ b/src/work-performed.tex @@ -0,0 +1,649 @@ + +\chapter{Work Performed} +\label{chap:work:performed} + +This chapter is dedicated to the solutions provided by this Master's +thesis. These solutions include: +\begin{itemize} +\item \textbf{Improving the accuracy of the Word2Vec model}: using a user-defined +depth to extract the child and parent nodes of a current node for each walk. In +addition, the centralization of the root node in the extracted walks +maximizes the number of training samples containing this root node and therefore +generates better quality embeddings. +\item \textbf{A BERT implementation}: builds its vocabulary only based on +special tokens and single extracted nodes, not allowing WordPiece splitting in +the tokenization of nodes. Finally, this version of BERT only considers +the MLM as a pre-training step, with training that tends to find a trade-off +between its time and the accuracy that this model will generate. +\item \textbf{A FastText implementation}: unlike its original version, its +implementation does not allow splitting $n$-grams on a defined minimum and maximum +length, but only according to a splitting function provided by a user. 
In +addition, the reimplementation of the Cython $n$-grams computation function in +Python to facilitate the use of \texttt{pyRDF2Vec} on Google +Colaboratory\footnote{Website that allows to run code on Google's cloud +servers.}. +\item \textbf{WideSampler}: sampling strategy that maximizes the extraction of + shared features between entities. +\item \textbf{SplitWalker}: walking strategy that extracts walks according to a + splitting function. +\item \textbf{A better architecture}: the \texttt{pyRDF2Vec} library now allows +to easily add walking strategies, sampling strategies, and embedding techniques +by reimplementing only a few functions. +\end{itemize} + +Finally, this Master's thesis implicitly proposes some research for future work +related to BERT. Among these, the injection of \texttt{(subject, object)} +2-tuples in BERT instead of two walks. Such an injection would allow BERT to +focus to predict predicates instead of seeing correlations between two different +walks. + +\section{Improving the Accuracy of the Word2Vec Model} +\label{sec:work:performed:word2vec} + +To better understand how it is possible to improve the model's accuracy +generated by Word2Vec, it is helpful to consider the initial problem with the +walk extraction. With RDF2Vec, the transformation for a $n$-tuple without any walks +limitation and concerning {\footnotesize\texttt{"URL\#Alice"}} as the root node +can be achieved with each walking strategy as follows: +\begin{table}[!ht] + \centering + \begin{threeparttable}[t] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{cll} + \toprule + \textbf{Walking Strategy} & \textbf{Initial/Transformed $\mathbf{n}$-tuple} \\ + \midrule + \multirow{2}{*}{Anonymous Walk} & {\footnotesize\texttt{("URL\#Alice", "URL\#knows","URL\#Bob")}} \\ + & {\footnotesize\texttt{("URL\#Alice", "1", "2")}} \\ + \midrule + \multirow{2}{*}{HALK} & {\footnotesize\texttt{("URL\#Alice", "URL\#knows", "URL\#Bob")}, \texttt{("URL\#Alice", "URL\#loves", "URL\#Bob")},} $\dotsc$ \\ + & {\footnotesize\texttt{("URL\#Alice", "URL\#knows","URL\#Bob")}}\tnote{\textcolor{blueLink}{1}} \\ + \midrule + \multirow{2}{*}{N-Gram} & {\footnotesize\texttt{("URL\#Alice", "URL\#knows", "URL\#Bob")}, \texttt{("URL\#Alice", "URL\#loves", "URL\#Dean")}} \\ + & {\footnotesize\texttt{("URL\#Alice", "URL\#knows", "0")}, \texttt{("URL\#Alice", "URL\#loves", "1")}}\tnote{\textcolor{blueLink}{2}} \\ + \midrule + \multirow{2}{*}{Walklets} & {\footnotesize\texttt{("URL\#Alice", "URL\#knows","URL\#Bob")}} \\ + & {\footnotesize\texttt{("URL\#Alice", "URL\#knows")}, \texttt{("URL\#Alice", "URL\#Bob")}} \\ + \midrule + \multirow{2}{*}{Weisfeiler-Lehman} & {\footnotesize\texttt{("URL\#Alice", "URL\#knows","URL\#Bob")}} \\ + & {\footnotesize\texttt{("URL\#Alice", "URL\#knows", "URL\#Bob")}, \texttt{("URL\#Alice", "URL\#knows", "URL\#Bob-URL\#knows")}}\tnote{\textcolor{blueLink}{3}} \\ + \bottomrule + \end{tabular} + }% + \begin{tablenotes} + \item[1] + Assuming a minor threshold frequency and an infrequent + {\footnotesize\texttt{"URL\#loves"}} hop compared to \\ other hops. + \item[2] Assuming at least one gram to relabel. If grams $> 2$, every object + names will remain \\ identical for this example. + \item[3] Assuming a Weisfeiler Lehman iteration of 1. + \end{tablenotes} + \end{threeparttable}% + \caption{Example of $n$-tuple Transformation for Type 2 Walking Strategies.} + \label{tab:walking:strategies} +\end{table} + +The walking strategies in Table \ref{tab:walking:strategies} already give good +results. 
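+
+As a toy illustration of one of these transformations, a hypothetical helper
+reproducing the Walklets row of Table \ref{tab:walking:strategies} could look as
+follows; it is not the \texttt{pyRDF2Vec} implementation.
+
+\begin{lstlisting}[caption=Toy Sketch of the Walklets Transformation.]
+def to_walklets(walk):
+    # Pair the root node with every other hop of the walk.
+    root = walk[0]
+    return [(root, hop) for hop in walk[1:]]
+
+print(to_walklets(("URL#Alice", "URL#knows", "URL#Bob")))
+# [('URL#Alice', 'URL#knows'), ('URL#Alice', 'URL#Bob')]
+\end{lstlisting}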
However, the {\footnotesize\texttt{"URL\#Alice"}} root node is always +positioned at the beginning of each walk, reducing the richness of these walks +for embedding techniques like Word2Vec, which works with window size. As the +solicitation of nodes placed in the middle of a walk is higher due to selecting +context words, including the root node as close as possible to the middle for +each walk could generate better quality embeddings. Indeed, much more training +sampling would contain this root node. + +In addition to this positioning issue of the root node, \texttt{type 1} walking +strategies have the main disadvantage of continuously extracting child nodes of +a root node. In other words, the embedding techniques never know the root node's +parents. An alternative would be to extract the whole child and parent nodes of +a root node. Such extraction maximizes the extracted walk information and, +therefore, improves a model's accuracy after being trained with an embedding +technique. However, this alternative is not suitable when extracted walks are +limited, mainly used to process large KGs. Furthermore, when it comes to +positioning, the extraction of parent nodes from a root node also implies a +wrong positioning. Specifically, this root node would be this time placing as +the last node of walks, preceded by a sequence of predicates and objects, which +is not desirable. + +Based on these constraints, this Master's thesis proposes to apply a +user-defined depth to extract the child and parent nodes of a current node for +each walk. Assuming that the root node has parent nodes, each extracted walk +will have a better position for this root node. Indeed, each root node will be +preceded by a part of its parent nodes and succeeded by a part of its child +nodes. + +In Table \ref{tab:window:size}, assuming the sentence ``I will always remember +her'' and considering the word ``I'' as the root node of a walk, the latter is +only included twice in the context words. However, suppose now this word is +positioned in the center instead of the word "always". In that case, its +frequency of occurrence can be doubled, i.e., from two to four times for this +example. As a result, the embeddings generated for the different root nodes are +of better quality by the number of occurrence of root nodes and by the context +provided. + +\section{BERT Implementation} +\label{sec:bert:implementation} + +The recommended library to implement BERT is +\texttt{huggingface/transformers}\footnote{\url{https://huggingface.co/transformers/}} +which provides many pre-trained +models\footnote{\url{https://huggingface.co/models}} and essential functions to +create a model. As no pre-training model exists for BERT with KGs, it is +necessary to create this model from scratch, which requires three main steps: +\begin{enumerate} +\item \textbf{Build the vocabulary}: based on the nodes, including one line per + special token and one line per unique node, as part of a KG. +\item \textbf{Fits the BERT model}: based on the corpus of walks provided, including three main goals: + \begin{multicols}{2} + \begin{enumerate} + \item \textbf{Node tokenization}: in such a way that special tokens are + inserted on both sides of the nodes, taking care not to split them. Which unlike + words, splitting a URI is not desired. + \item \textbf{Pre-training}: only done with MLM, as the NSP pre-training + task is not helpful since the walks do not share any continuity. 
+ \item \textbf{Training}: is done by providing a training set of formatted + walks, a data collator (e.g., \texttt{DataCollatorForLanguageModeling}), and the + training parameters. The walk formatting is necessary to ensure added + padding, truncating walks that are too long ($\geq$ 512 characters). + \end{enumerate} + \end{multicols} +\item \textbf{Transforms the provided entities into embeddings}: returns entity embeddings. +\end{enumerate} + +\begin{lstlisting}[caption=Creation of the Walk Data Set.,label=bert:walk:dataset] +class WalkDataset(Dataset): + def __init__(self, corpus, tokenizer): + self.walks = [ + tokenizer( + " ".join(walk), padding=True, truncation=True, max_length=512 + ) + for walk in corpus + ] + + def __len__(self): + return len(self.walks) + + def __getitem__(self, i): + return self.walks[i] +\end{lstlisting} + +In Algorithm \ref{bert:walk:dataset}, creating the walks data set for BERT +is done by creating a dedicated class. Within this class, tokenization is necessary. +Each walk is truncated to 512 characters, followed by padding to handle +walks of the same size. Finally, it is also useful to implement the +Dunder\footnote{Also called \emph{magic} methods.} \texttt{\_\_len\_\_} and +\texttt{\_\_getitem\_\_} methods to define the size of a sample of training data +and for the fetching of this sample. + +\begin{lstlisting}[caption=Creation of Node Vocabulary.,label=bert:node:vocabulary] +def _build_vocabulary(self, nodes, is_update = False): + with open(self.vocab_filename, "w") as f: + if not is_update: + for token in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]: + f.write(f"{token}\n") + self._vocabulary_size += 1 + for node in nodes: + f.write(f"{node}\n") + self._vocabulary_size += 1 +\end{lstlisting} + +In Algorithm \ref{bert:node:vocabulary}, each walk node is stored in a file with +special tokens at the beginning of the file, namely: \texttt{[PAD]}, +\texttt{[UNK]}, \texttt{[CLS]}, \texttt{[SEP]}, and \texttt{[MASK]}. Finally, a +boolean is also sent to update or not an already existing vocabulary and avoid +the complete re-training of the model. + +\begin{lstlisting}[caption=Fitting the BERT Model According to the Provided Walks.,label=bert:fit] +def fit(self, walks, is_update = False): + walks = [walk for entity_walks in walks for walk in entity_walks] + nodes = list({node for walk in walks for node in walk}) + self._build_vocabulary(nodes, is_update) + self.tokenizer = BertTokenizer( + vocab_file=self.vocab_filename, + do_lower_case=False, + never_split=nodes, + ) + self.model_ = BertForMaskedLM( + BertConfig( + vocab_size=self._vocabulary_size, + max_position_embeddings=512, + type_vocab_size=1, + ) + ) + Trainer( + model=self.model_, + args=self.training_args, + data_collator=DataCollatorForLanguageModeling( + tokenizer=self.tokenizer + ), + train_dataset=WalkDataset(walks, self.tokenizer), + ).train() + \end{lstlisting} + +In Algorithm \ref{bert:fit}, the training of the BERT model consists of +retrieving each unique node from the provided walks, tokenizing these walks, +choosing a suitable hyperparameter config, and running the training based on a +set of walks data. + +BERT needs to be trained on a large enough corpus of walks to provide good +results. Having such a corpus is not always possible, depending on the size of +some KGs. In addition, hyperparameters for training BERT impact the model's +accuracy and training time. This training time can take hours, days, weeks, if +not more. 
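+
+For reference, a minimal configuration of the \texttt{training\_args} object
+passed to the \texttt{Trainer} in Algorithm \ref{bert:fit} could look as
+follows; the values are purely illustrative and have to be tuned for each KG and
+corpus of walks.
+
+\begin{lstlisting}[caption=Illustrative Training Hyperparameters for BERT.]
+from transformers import TrainingArguments
+
+training_args = TrainingArguments(
+    output_dir="./bert-rdf2vec",     # checkpoint directory (illustrative)
+    num_train_epochs=3,              # more epochs improve accuracy but cost time
+    per_device_train_batch_size=32,  # bounded by the available GPU memory
+    learning_rate=5e-5,              # common starting point for BERT training
+)
+\end{lstlisting}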
Therefore, it is necessary to improve the quantity and quality of the corpus of
+walks produced by RDF2Vec as much as possible to reduce this training time.
+
+\begin{lstlisting}[caption=Getting the Entities' Embeddings with BERT.,label=bert:embeddings]
+def transform(self, entities):
+    check_is_fitted(self, ["model_"])
+    return [
+        # Look up the learned embedding of the entity's token
+        # (index 1 skips the [CLS] token).
+        self.model_.bert.embeddings.word_embeddings.weight[
+            self.tokenizer(entity)["input_ids"][1]
+        ]
+        .cpu()
+        .detach()
+        .numpy()
+        for entity in entities
+    ]
+\end{lstlisting}
+
+In Algorithm \ref{bert:embeddings}, retrieving the embeddings of the requested
+entities, i.e., the root nodes, consists of looking up the learned word
+embeddings of these entities and converting them into properly formatted NumPy
+arrays.
+
+These methods are sufficient for a classical implementation of BERT. All the
+subtlety of generating a good model lies in the extraction and injection of
+walks and in the values of the chosen hyperparameters.
+
+\section{FastText Implementation}
+\label{sec:fasttext:implementation}
+
+To implement FastText, the
+\texttt{gensim}\footnote{\url{https://github.com/RaRe-Technologies/gensim}}
+library is recommended. However, \texttt{pyRDF2Vec} had to reimplement much of
+its code in order:
+\begin{itemize}
+\item \textbf{To remove the \texttt{min\_n} and \texttt{max\_n} parameters for $n$-gram
+  splitting}: the \texttt{object} nodes in \texttt{pyRDF2Vec} are encoded in MD5
+  to reduce their storage in RAM. Therefore, splitting them into $n$-grams is pointless.
+\item \textbf{To allow a user to compute $n$-grams for walks only by splitting
+  the URIs of subjects and predicates} (by default, split on their symbols): a user
+  will likely want to provide an alternative splitting strategy for computing entity
+  $n$-grams on the KG. In this case, \texttt{pyRDF2Vec} allows the user
+  to implement this function, which FastText will then use.
+\item \textbf{To avoid a dependency on Cython}: Cython is a programming language between
+Python and C. Its main interest is to obtain computation times similar to C for
+specific Python functions by reimplementing them in Cython. The \texttt{gensim}
+library uses Cython for the computation of $n$-gram hashes. However,
+\texttt{pyRDF2Vec} has chosen to reimplement this function in Python to
+facilitate its use on Google Colaboratory.
+\end{itemize}
+
+\begin{lstlisting}[caption=Reimplementation of the Hash Calculation Functions in
+  \texttt{gensim}.,label=fasttext:hash]
+import re
+
+import numpy as np
+
+def compute_ngrams_bytes(entity):
+    # Split a URI on its "#" symbol; other entities are kept as they are.
+    if "http" in entity:
+        ngrams = " ".join(re.split("[#]", entity)).split()
+        return [str.encode(ngram) for ngram in ngrams]
+    return [str.encode(entity)]
+
+def ft_hash_bytes(bytez: bytes) -> int:
+    # FNV-1a-style hash of an encoded n-gram.
+    h = 2166136261
+    for b in bytez:
+        h = h ^ b
+        h = h * 16777619
+    return h
+
+def ft_ngram_hashes(entity, num_buckets=2000000):
+    # Hash the n-grams of an entity into a fixed number of buckets,
+    # using the splitting function above (or a user-provided one).
+    encoded_ngrams = compute_ngrams_bytes(entity)
+    return [ft_hash_bytes(n) % num_buckets for n in encoded_ngrams]
+
+def recalc_char_ngram_buckets(bucket, buckets_word, index_to_key) -> None:
+    # Recompute, in place, the n-gram buckets of every word in the vocabulary.
+    if bucket == 0:
+        buckets_word[:] = [np.array([], dtype=np.uint32)] * len(index_to_key)
+        return
+    buckets_word[:] = [
+        np.array(ft_ngram_hashes(word, bucket), dtype=np.uint32)
+        for word in index_to_key
+    ]
+\end{lstlisting}
+
+In Algorithm \ref{fasttext:hash}, the functions present in \texttt{gensim} are
+reimplemented so that they rely on the splitting function of
+\texttt{pyRDF2Vec}.
Added to that this reimplementation does not consider the +use of Cython anymore. + +\begin{lstlisting}[caption=Fitting the FastText Model According to the Provided Walks.,label=fasttext:fit] +def fit(self, walks, is_update = False): + corpus = [walk for entity_walks in walks for walk in entity_walks] + self._model.build_vocab(corpus, update=is_update) + self._model.train( + corpus, + total_examples=self._model.corpus_count, + epochs=self._model.epochs, + ) + return self +\end{lstlisting} + +In Algorithm \ref{fasttext:fit}, training with FastText consists of extracting +each node from each walk, building the vocabulary, and training the model based +on the corpus of walks. + +\begin{lstlisting}[caption=Getting the Entity Embeddings with FastText.,label=fasttext:transform] +def transform(self, entities): + if not all([entity in self._model.wv for entity in entities]): + raise ValueError( + "The entities must have been provided to fit() first " + "before they can be transformed into a numerical vector." + ) + return [self._model.wv.get_vector(entity) for entity in entities] +\end{lstlisting} + +In Algorithm \ref{fasttext:transform}, even though FastText can generate entity +embeddings that it has not learned, \texttt{pyRDF2Vec} throws an exception +instead of avoiding any unpleasant surprises from the model’s accuracy. + +\section{SplitWalker} +\label{sec:split:walker} + +Based on the idea of FastText, but directly applied to the extraction of walks, +this Master's thesis proposes \texttt{SplitWalker} as a new \texttt{type +2}. Specifically, this strategy splits the vertices of the random walks for a +based entity. To achieve this, each vertex, except the root node, is split +according to symbols, capitalization, and numbers by removing any duplication. + +\begin{table}[!ht] + \centering + \resizebox{\textwidth}{!}{% + \begin{tabular}{cll} + \toprule + & \textbf{Initial Node} & \textbf{Node After Splitting} \\ + \midrule + \multirow{4}{*}{\footnotesize Walk 1} & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#d19} & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#d19} \\ + & \multirow{2}{*}{\footnotesize\texttt{http://dl-learner.org/carcinogenesis\#hasBond}} & \footnotesize\texttt{has} \\ + & & \footnotesize\texttt{bond} \\ + & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#bond3209} &\footnotesize\texttt{3209} \\ + \midrule + \multirow{3}{*}{\footnotesize Walk 2} & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#d19} & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#d19} \\ + & \footnotesize\texttt{http://www.w3.org/1999/02/22-rdf-syntax-ns\#type} & \footnotesize\texttt{type} \\ + & \footnotesize\texttt{http://dl-learner.org/carcinogenesis\#Compound} & \footnotesize\texttt{compound} \\ + \bottomrule + \end{tabular} + }% + \caption{Example of Use of \texttt{SplitWalker}.} + \label{work:performed:splitwalker} +\end{table} + +In Table \ref{work:performed:splitwalker}, two walks are transformed by +\texttt{SplitWalker}. Both keep their root node intact. However, the first walk +has the \texttt{.../hasBond} split by the letter \texttt{B} into two nodes: +\texttt{has} and \texttt{bond}. In addition, its third node +\texttt{.../bond3209} is also split into two nodes: \texttt{bond} and +\texttt{3209}, but since \texttt{bond} is an existing node in the walk, it is +removed from it. Finally, the second walk has the particularity to have a node +with a capital letter. In this case, this node is rewritten in lowercase. 
+ +\begin{lstlisting}[caption=Splits Nodes of Random Walks with \texttt{SplitWalker}.,label=splitwalker:split] +def basic_split(self, walks): + canonical_walks = set() + for walk in walks: + canonical_walk = [walk[0].name] + for i, _ in enumerate(walk[1::], 1): + vertices = [] + if "http" in walk[i].name: + vertices = " ".join(re.split("[\#]", walk[i].name)).split() + if i % 2 == 1: + name = vertices[1] if vertices else walk[i].name + preds = [ + sub_name + for sub_name in re.split(r"([A-Z][a-z]*)", name) + if sub_name + ] + for pred in preds: + canonical_walk += [pred.lower()] + else: + name = vertices[-1] if vertices else walk[i].name + objs = [] + try: + objs = [str(float(name))] + except ValueError: + objs = re.sub("[^A-Za-z0-9]+", " ", name).split() + if len(objs) == 1: + match = re.match( + r"([a-z]+)([0-9]+)", objs[0], re.I + ) + if match: + objs = list(match.groups()) + for obj in objs: + canonical_walk += [obj.lower()] + canonical_walk = list(dict(zip(canonical_walk, canonical_walk))) + canonical_walks.add(tuple(canonical_walk)) + return canonical_walks +\end{lstlisting} + +Algorithm \ref{splitwalker:split} starts by traversing each provided +walk, making sure to save the root node characterized by the first vertex of the +walk. Then, this function looks to see if that node has the prefix ``http'' for +each node. If it does, then that node is split by the \texttt{\#} +symbol. Otherwise, if the current node is a predicate, this function does an +uppercase split to create other nodes. Finally, this process is also done in the +case where the node contains numbers. In the end, this function deletes all the +duplicated nodes and returns the walks. + +\section{WideSampler} +\label{sec:wide:sampler} + +Based on the principle that humans tend to classify objects according to common +features (e.g., color and shape), this Master's thesis proposes a new sampling +strategy called \texttt{WideSampler}. This strategy addresses the assumption +that single entity-specific features would have a negligible impact on the +quality of the generated embeddings. \texttt{WideSampler} assigns higher weights +to edges that lead to the most significant number of predicates and objects in +the neighborhood and terms of occurrence in a graph to extract a maximum of +shared features between entities. + +After training the model, this walking strategy can retrieve the weight of a +neighboring hop as shown below. +\begin{algorithm} + \caption{\texttt{get\_weight(h, d, c)}} + \label{alg:wide:sampler:get:weight} + \begin{algorithmic}[1] + \REQUIRE a $\mathcal{H}$ 2-tuple that contains a predicate and an object. + \REQUIRE a $\mathcal{D}$ array of $n \geq 1$ degree indexed from $0$ to $n - 1$ + \REQUIRE a $\mathcal{C}$ array of $n \geq 1$ counter of neighbors indexed from $0$ to $n - 1$ + \ENSURE The weight of the hop for this predicate + \IF{$\mathcal{D}_{preds}$ and $\mathcal{D}_{objs}$ and $\mathcal{C}$} + \RETURN $\left(\mathcal{C}[\mathcal{H}[0]_{name}]\ +\ \mathcal{C}[\mathcal{H}[1]_{name}]\right)$ $\left(\dfrac{\mathcal{D}_{preds}[\mathcal{H}[0]_{name}] + \mathcal{D}_{objs}[\mathcal{H}[1]_{name}]}{2}\right)$ + \ENDIF + \end{algorithmic} +\end{algorithm} + +Algorithm \ref{alg:wide:sampler:get:weight} assigns a weight to a +\texttt{(predicate, object)} hop according to the multiplication of two +sums. The first one considers the child nodes of a predicate and the parent +nodes of an object node. The second one considers the number of occurrences of +this predicate and this node in the whole KG. 
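+
+A hedged Python transcription of this weight computation could look as follows;
+the dictionary names are hypothetical, vertices are assumed to expose a
+\texttt{name} attribute, and the counters are assumed to have been filled by the
+fitting step of the strategy.
+
+\begin{lstlisting}[caption=Illustrative Transcription of the Weight Computation of \texttt{WideSampler}.]
+def get_weight(hop, degrees_preds, degrees_objs, counters):
+    # `hop` is a (predicate, object) pair; the dictionaries map vertex
+    # names to neighbor counts and to occurrence counts in the whole KG.
+    pred, obj = hop
+    occurrences = counters[pred.name] + counters[obj.name]
+    neighborhood = (degrees_preds[pred.name] + degrees_objs[obj.name]) / 2
+    return occurrences * neighborhood
+\end{lstlisting}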
Finally, the second sum is divided +by half to provide a slight preference to a hop that reaches multiples nodes. + +\newpage + +\begin{algorithm} + \caption{\texttt{fit(v)}} + \label{alg:wide:sampler:fit} + \begin{algorithmic}[1] + \REQUIRE an $\mathcal{V}$ array of $n \geq 1$ edges indexed from $0$ to $n - 1$ + \ENSURE Fits the WideSampler sampling strategy + \STATE $\mathcal{C}_{objs}$ $\leftarrow$ new array of $n$ objects. + \STATE $\mathcal{C}_{preds}$ $\leftarrow$ new array of $n$ predicates. + \FORALL{$vertex \in \mathcal{V}$} + \IF{$vertex$ is predicate} + \STATE $\mathcal{C}_{neighbors}[vertex_{name}] = |\texttt{get\_children(vertex)}|$ + \STATE $\mathcal{C}_{tmp} \leftarrow \mathcal{C}_{preds}$ + \ELSE + \STATE $\mathcal{C}_{neighbors}[vertex_{name}] = |\texttt{get\_parents(vertex)}|$ + \STATE $\mathcal{C}_{tmp} \leftarrow \mathcal{C}_{objs}$ + \ENDIF + + \IF{$vertex_{name}$ in $\mathcal{C}_{tmp}$} + \STATE $\mathcal{C}_{tmp}[vertex_{name}] \leftarrow \mathcal{C}[vertex_{name}] + 1$ + \ELSE + \STATE $\mathcal{C}_{tmp}[vertex_{name}] \leftarrow 1$ + \ENDIF + \ENDFOR + \end{algorithmic} +\end{algorithm} + +Algorithm \ref{alg:wide:sampler:fit} trains the \texttt{WideSampler} strategy by +iterating through a set of vertices. Depending on the vertex, the algorithm +considers the number of children or parents of this vertex. If it is a +predicate, then the number of its child nodes is stored in a counter. Otherwise, +the algorithm stores the number of parent nodes in another counter. Finally, the +number of occurrences of each identical vertex in the whole KG is also +collected. + +\section{Library Architecture} +\label{sec:pyRDF2Vec} + +\texttt{pyRDF2Vec}\footnote{\url{https://github.com/IBCNServices/pyRDF2Vec}} is +the Python library created by IDLab that allows to use RDF2Vec. Through the +internship and this Master's thesis, this library has undergone many changes in +its architecture to improve its use. 
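+
+Before detailing this architecture, the following minimal example sketches how
+the library is typically used; the exact class names, parameters, and return
+values may differ slightly between \texttt{pyRDF2Vec} versions, and the file
+path is purely illustrative.
+
+\begin{lstlisting}[caption=Minimal Usage Sketch of \texttt{pyRDF2Vec}.]
+from pyrdf2vec import RDF2VecTransformer
+from pyrdf2vec.embedders import Word2Vec
+from pyrdf2vec.graphs import KG
+from pyrdf2vec.samplers import UniformSampler
+from pyrdf2vec.walkers import RandomWalker
+
+# Walks of depth 4 (at most 10 per entity), sampled uniformly and
+# embedded with Word2Vec.
+transformer = RDF2VecTransformer(
+    Word2Vec(),
+    walkers=[RandomWalker(4, 10, UniformSampler())],
+    verbose=1,
+)
+embeddings = transformer.fit_transform(
+    KG("samples/mutag/mutag.owl"),  # illustrative local KG
+    ["http://dl-learner.org/carcinogenesis#d19"],
+)
+\end{lstlisting}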
+\begin{figure}[!ht] + \centering + \resizebox{\linewidth}{!} { + \begin{tikzpicture}[>=stealth', + extract_walks/.style={draw,minimum width=.8cm,minimum height=.8cm,fill=mygreen!40}, + process/.style={draw,minimum width=1cm,minimum height=1cm,node distance=0.35cm,fill=mybrown!20} + ] + \node[draw,minimum width=2.5cm,minimum height=3cm,loosely dashed,color=darkBlue] at (0,0) (graph_entities) {}; + \node[draw,minimum width=2cm,minimum height=1cm,yshift=-65pt,above=of graph_entities,fill=myblue] (graph) {Graph}; + \node[draw,minimum width=2cm,minimum height=1cm,yshift=15pt,below=of graph,fill=myblue] (entities) {Entities}; + \node[draw,minimum width=2cm,minimum height=1cm,yshift=2pt,above=of graph,fill=myblue!50] (connector) {Connector}; + + \node[inner sep=0pt,xshift=-20pt,above=of connector] (rdf) {\includegraphics[width=0.074\textwidth]{img/rdf}}; + \node[inner sep=0pt,xshift=20pt,above=of connector] (sparql) {\includegraphics[width=0.08\textwidth]{img/sparql}}; + + \node[label,below of=graph_entities,yshift=-1.1cm,color=darkBlue] {\textbf{Inputs}}; + + \node[draw,minimum width=3cm,minimum height=1cm,right=of graph_entities,fill=myyellow] (transformer) {Transformer}; + \node[draw,minimum width=2.5cm,minimum height=3cm,loosely dashed,color=darkRed,right=of transformer] (walker_sampler) {}; + \node[draw,minimum width=2cm,minimum height=1cm,yshift=-65pt,above=of walker_sampler,fill=myred] (walker) {Walker}; + \node[draw,minimum width=2cm,minimum height=1cm,yshift=15pt,below=of walker,fill=myred] (sampler) {Sampler}; + \node[draw,minimum width=3cm,minimum height=1cm,above=of transformer,fill=mypurple!50] (embedder) {Embedder}; + \node[draw,ellipse,minimum width=3cm,minimum height=1cm,above=of embedder,fill=mypurple] (embeddings) {Embeddings}; + \node[draw,ellipse,minimum width=3cm,minimum height=1cm,node distance=0.5cm,right=of embeddings,fill=mypurple] (literals) {Literals}; + + \node[label,above of=walker_sampler,xshift=0.3cm,yshift=20pt,color=darkRed] {\textbf{Strategy}}; + + \node[inner sep=0pt,xshift=-50pt,above right=of embeddings] (sklearn) {\includegraphics[width=0.15\textwidth]{img/scikit-learn}}; + + \node[process,node distance=9cm,right=of connector] (p1) {P1/T1}; + + \node[process,below=of p1] (p2) {P2/T2}; + \node[process,below=of p2] (p3) {P3/T3}; + \node[process,below=of p3] (p4) {P4/T4}; + + \node[extract_walks,right=of p1] (extract_walks_1) {Extract Walks}; + \node[extract_walks,right=of p2] (extract_walks_2) {Extract Walks}; + \node[extract_walks,right=of p3] (extract_walks_3) {Extract Walks}; + \node[extract_walks,right=of p4] (extract_walks_4) {Extract Walks}; + + \node[draw,minimum width=7.5cm,minimum height=1.5cm,xshift=1.8cm,yshift=-2.3cm,above=of embeddings,loosely dashed,color=darkPurple] (output) {}; + \node[label,above of=output,xshift=2.5cm,yshift=-0.1cm,color=darkPurple] {\textbf{Outputs}}; + + \node[label,below of=p4,yshift=-0.10cm] {\dots}; + \node[label,below of=extract_walks_4,yshift=-0.10cm] {\dots}; + + \node[draw,ellipse,minimum width=3cm,minimum height=1cm, node distance=8cm,right=of walker,fill=mygreen] (walks) {Walks}; + + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (transformer) -- (walker_sampler) node[midway,above] {(2)}; + + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (rdf.south) -- ([xshift=-20pt]connector.north); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (sparql.south) -- ([xshift=20pt]connector.north); + \draw[arrow,shorten >=0.05cm,shorten <=0.2cm] (connector) -- (graph); + + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (p1) -- 
(extract_walks_1); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (p2) -- (extract_walks_2); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (p3) -- (extract_walks_3); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (p4) -- (extract_walks_4); + + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (literals) -- (sklearn); + + \draw[shorten <=0.1cm] ([yshift=12pt]walker.east) -| ([xshift=-20pt]p1.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-20pt]p1.west) -- (p1.west); + + \draw[shorten <=0.1cm] ([yshift=4pt]walker.east) -| ([xshift=-15pt]p2.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-15pt]p2.west) -- (p2.west); + + \draw[shorten <=0.1cm] ([yshift=-4pt]walker.east) -| ([xshift=-15pt]p3.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-15pt]p3.west) -- (p3.west); + + \draw[shorten <=0.1cm] ([yshift=-12pt]walker.east) -| ([xshift=-20pt]p4.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-20pt]p4.west) -- (p4.west); + + \draw[shorten <=0.1cm] (extract_walks_1.east) -| ([xshift=-15pt,yshift=12pt]walks.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-15pt,yshift=12pt]walks.west) -- ([yshift=12pt]walks.west); + + \draw[shorten <=0.1cm] (extract_walks_2.east) -| ([xshift=-20pt,yshift=4pt]walks.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-20pt,yshift=4pt]walks.west) -- ([yshift=4pt]walks.west); + + \draw[shorten <=0.1cm] (extract_walks_3.east) -| ([xshift=-20pt,yshift=-4pt]walks.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-20pt,yshift=-4pt]walks.west) -- ([yshift=-4pt]walks.west); + + \draw[shorten <=0.1cm] (extract_walks_4.east) -| ([xshift=-15pt,yshift=-12pt]walks.west); + \draw[arrow,shorten >=0.1cm] ([xshift=-15pt,yshift=-12pt]walks.west) -- ([yshift=-12pt]walks.west); + + \draw[arrow,shorten >=0.1cm,shorten <=0.1cm] (sampler) -- (walker); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (transformer) -- (embedder) node[midway,right] {(3)}; + \draw[arrow,shorten >=0.05cm,shorten <=0.2cm] (embedder) -- (embeddings); + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (embeddings) -- (sklearn); + + \draw[arrow,shorten >=0.2cm,shorten <=0.2cm] (graph_entities) -- (transformer.west) node[midway,above] {(1)}; + + \draw[shorten <=0.2cm] ([xshift=0.2cm]transformer) -- ([yshift=1.3cm]walker_sampler.north); + \draw[arrow,shorten >=0.05cm] ([yshift=1.3cm]walker_sampler.north) -- (literals); + + \draw ([xshift=10pt]walks.east) |- ([yshift=-20pt]extract_walks_4.south); + \draw[shorten <=0.1cm] (walks) -- ([xshift=10pt]walks.east); + \draw[arrow,shorten >=0.2cm] ([yshift=-20pt]extract_walks_4.south) -| (transformer.south); + \end{tikzpicture} + } + \caption{Workflow of \texttt{pyRDF2Vec}.} + \label{tikz:workflow} +\end{figure} + +In Figure \ref{tikz:workflow}, the workflow of \texttt{pyRDF2Vec} includes three +primary operations and is divided into seven main consecutive significant +blocks: +\begin{multicols}{2} +\begin{enumerate} +\item \textbf{Connector}: in charge of interaction with a local or remote graph. +\item \textbf{Graph}: in charge of providing a graph encoding the knowledge-based. +\item \textbf{Entities}: in charge of provides the entities in a +\texttt{rdflib.URI.term} or \texttt{str} type to generate the embeddings. +\item \textbf{Transformer}: in charge of converting graphs into embeddings for +downstream ML tasks, using a walking strategy and sampling strategy and an +embedder. +\item \textbf{Sampler}: in charge of prioritizing the use of some paths over +others using a weight allocation strategy. 
+\item \textbf{Walker}: in charge of extracting walks from a KG for the provided
+entities, optionally guided by a sampling strategy and using multiple
+processes/threads.
+\item \textbf{Embedder}: in charge of training a model with an embedding
+technique using the extracted walks and therefore generating the embeddings of
+the entities provided by a user.
+\end{enumerate}
+\end{multicols}
+
+The design of such an architecture allows for a long-term vision. Due to this
+architecture, a user can contribute to \texttt{pyRDF2Vec} and easily add new
+walking strategies, sampling strategies, and embedding techniques. For this
+Master's thesis, implementing this architecture was necessary to facilitate the
+comparison of embedding techniques. Each new embedding technique is added to the
+\texttt{Embedder} package and must reimplement the \texttt{fit} and
+\texttt{transform} functions of the \texttt{Embedder} class.
+
+%%% Local Variables:
+%%% mode: latex
+%%% TeX-master: "latex"
+%%% End: