Skip to content

Algolia search #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
aniso8601==7.0.0
algoliasearch==2.0.4
ansicolors==1.1.8
asn1crypto==0.24.0
certifi==2019.3.9
Expand All @@ -19,6 +20,7 @@ Jinja2==2.10.1
jwcrypto==0.6.0
MarkupSafe==1.1.1
marshmallow==3.0.1
mistune==0.8.4
pycparser==2.19
PyGithub==1.43.7
Pygments==2.4.2
Expand Down
Empty file added backend/search/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions backend/search/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import os

from algoliasearch.search_client import SearchClient

from tools import logger

# Algolia credentials are taken from the environment; any of them may be
# missing in a local/dev setup, in which case client creation fails below.
ALGOLIA_APP_ID = os.getenv("ALGOLIA_APP_ID")
ALGOLIA_ADMIN_KEY = os.getenv("ALGOLIA_ADMIN_KEY")
ALGOLIA_SEARCH_KEY = os.getenv("ALGOLIA_SEARCH_KEY")

INDEX_NAME = "test_DOCUMENTATION"

# Predefine to None so that a failed initialisation leaves CLIENT/INDEX
# defined: the original left them unbound, so any later access raised
# NameError/AttributeError instead of failing gracefully in insert_doc.
CLIENT = None
INDEX = None

try:
    CLIENT = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_ADMIN_KEY)
    INDEX = CLIENT.init_index(INDEX_NAME)
except Exception:
    logger.get_logger().exception("Failed to initialise algolia client")


def insert_doc(doc):
    """Save *doc* into the Algolia search index.

    :param doc: dict representing one searchable record (must carry an
        ``objectID`` key — assumed set by the caller, TODO confirm)
    :return: True when the document was indexed, False on any failure
        (the error is logged, never propagated)
    """
    try:
        INDEX.save_objects([doc])
        return True
    except Exception:
        # Best-effort: indexing must never crash the caller, so we log
        # the full traceback and report failure through the return value.
        logger.get_logger().exception("Failed to index markdown document %s", doc)
        return False
183 changes: 183 additions & 0 deletions backend/search/markdown_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import hashlib

import mistune
import re

from search import client
from tools import logger
from utils.exceptions.search_exceptions import IndexingException


class CustomRenderer(mistune.Renderer):
    """Mistune renderer that indexes every rendered markdown element.

    While mistune walks the document, each title, header and paragraph is
    pushed as a separate search record through ``client.insert_doc``,
    enriched with the current header hierarchy, a link, an importance
    score and a content hash used as the object id.
    """

    # Matches a paragraph that is nothing but a single rendered image or
    # link; such paragraphs are not worth indexing. Compiled once instead
    # of on every paragraph() call.
    _IMG_OR_LINK_RE = re.compile('^<(img|a).*>$')

    def __init__(self, title, source):
        super().__init__()

        self.title = title
        self.source = source

        # Stack of the currently open headers, innermost last; each entry
        # is {'h': text, 'level': int}.
        self.current_header = []
        self.tables = []
        self.lists = []

        # Index the document title itself as a searchable record.
        self._insert_document([], None, type='title')

    def header(self, text, level, raw=None):
        self._add_header(text, level)
        self._insert_document(self.current_header, None, type='header')

        return super().header(text, level, raw)

    def paragraph(self, text):
        # Insert only if not an image or a link alone
        if not self._IMG_OR_LINK_RE.search(text):
            self._insert_document(self.current_header, text, type='paragraph')

        return super().paragraph(text)

    def _add_header(self, text, level):
        """Maintain ``current_header`` as a stack mirroring header nesting."""
        if not self.current_header:
            self.current_header.append({'h': text, 'level': level})
            return

        top_level = self.current_header[-1].get('level')
        if top_level == level:
            # Sibling header: replace the previous one at the same depth.
            self.current_header.pop()
            self.current_header.append({'h': text, 'level': level})
        elif top_level > level:
            # Shallower header: unwind the stack, then retry.
            self.current_header.pop()
            self._add_header(text, level)
        else:
            # Deeper header: push on top of the current hierarchy.
            self.current_header.append({'h': text, 'level': level})

    #
    # Inserters into the documents
    #

    def _insert_document(self, headers, content, type):
        """Build the search record for one element and hand it to the client."""
        doc = {
            'title': self.title,
            'source': self.source,
            'type': type,
        }

        self._insert_headers(doc, headers)
        self._insert_content(doc, content)
        self._insert_link(doc, headers)
        self._insert_importance(doc, headers, content)
        self._insert_document_hash(doc, headers, content)

        client.insert_doc(doc)

    def _insert_headers(self, doc, headers):
        """Insert headers based on their level of depth (h1, h2, ... fields)."""
        for header in headers:
            doc['h' + str(header['level'])] = header['h']

    def _insert_content(self, doc, content):
        """Insert content only when it exists (it is absent for titles/headers)."""
        if content:
            doc['content'] = content

    def _insert_link(self, doc, headers):
        """Generate a link to the first header of the document.

        FIXME: we could link to a second or third layer of depth when
        linking to a document part.
        """
        # BUG FIX: the original line ended with a trailing comma, which
        # stored a 1-tuple instead of a string in doc['link'].
        doc['link'] = self.title + ("#" + str(headers[0]['h']) if headers else '')

    def _insert_importance(self, doc, headers, content):
        """Importance is a number between 0 and 7: 0 for the document title,
        up to 7 for a paragraph nested under 5 layers of headers. The lower
        the importance, the higher the record ranks for equal text matches
        (see the Algolia tie-breaking algorithm).
        """
        doc['importance'] = len(headers) + (1 if content else 0)

    def _insert_document_hash(self, doc, headers, content):
        """Hash the element's text so the same content is never indexed twice
        (Algolia de-duplicates on objectID).
        """
        file_string = self.title + ''.join(h['h'] for h in headers) + (content or '')
        doc['objectID'] = hashlib.md5(file_string.encode("utf-8")).hexdigest()

    #
    # NOT NEEDED — rendered normally but deliberately not indexed
    #

    # TODO: we do not index code as hard to deal with it (blocks can be huge so hard to display them in the search)
    def block_code(self, code, lang=None):
        return super().block_code(code, lang)

    # TODO: same problem with tables
    def table(self, header, body):
        return super().table(header, body)

    # TODO: same problem with lists, although more manageable
    def list(self, body, ordered=True):
        return super().list(body, ordered)

    def image(self, src, title, text):
        return super().image(src, title, text)

    def inline_html(self, html):
        return super().inline_html(html)

    def codespan(self, text):  # inline code
        return super().codespan(text)

    def text(self, text):
        return super().text(text)

    def autolink(self, link, is_email=False):
        return super().autolink(link, is_email)

    def link(self, link, title, text):
        return super().link(link, title, text)

    def table_cell(self, content, **flags):
        # BUG FIX: flags must be forwarded as keyword arguments; the
        # original passed the dict positionally, which breaks the
        # mistune Renderer.table_cell signature.
        return super().table_cell(content, **flags)

    def table_row(self, content):
        return super().table_row(content)

    def list_item(self, text):
        return super().list_item(text)


def insert_markdown_doc(source, title, content):
    """Parse *content* as markdown and index every element for search.

    :param source: origin of the document (e.g. 'app')
    :param title: document title, attached to every indexed record
    :param content: raw markdown text
    :return: True when parsing/indexing completed, False on failure
    """
    renderer = CustomRenderer(title, source)
    markdown = mistune.Markdown(renderer=renderer)

    success = True

    try:
        markdown(content)
    # BUG FIX: only IndexingException was caught before, but mistune and the
    # renderer raise plain exceptions, so real failures escaped to the caller
    # despite this function's boolean contract. Exception also covers
    # IndexingException.
    except Exception:
        logger.get_logger().exception("Failed to index markdown document %s", title)
        success = False

    return success
Empty file.
4 changes: 4 additions & 0 deletions backend/utils/exceptions/search_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class IndexingException(Exception):
    """Raised when a document could not be indexed for search."""
5 changes: 4 additions & 1 deletion backend/web_server/endpoints/abstract_endpoint.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from flask import jsonify, request
from flask_restful import Resource
from flask_restful import Resource, abort


class AbstractEndpoint(Resource):
Expand Down Expand Up @@ -42,3 +42,6 @@ def _create_validated_response(self, json_or_object):

def _create_empty_response(self):
return jsonify({})

def _create_error_response(self, code, msg):
    """Abort the current request with HTTP status *code*, sending *msg* back
    to the client in the JSON ``message`` field.
    """
    # flask_restful.abort raises immediately; the return is for symmetry
    # with the other _create_*_response helpers.
    return abort(code, message=msg)
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from flask import request
from flask_restful import abort

from mongo.collection_clients.clients.db_doc_client import DbDocClient
from mongo.constants.model_fields import ModelFields
from mongo.models.db_doc_model import DbDocModel
from tools import logger
from search import markdown_indexer
from web_server.endpoints.user_endpoints.account_endpoints.abstract_user_account_endpoint import AbstractAccountEndpoint


Expand All @@ -14,13 +12,25 @@ class AccountSaveEndpoint(AbstractAccountEndpoint):
"""

def post(self, github_account_login):
    """Save the posted document for *github_account_login* and index it
    for search.

    :return: empty JSON response on success; aborts with 400 when the
        search indexing fails.
    """
    # FIXME: today, we can override docs if they have the same name, in the future, we will have a document id

    # BUG FIX: removed a leftover debug `raise Exception("Problem")` that
    # made everything below unreachable.
    new_doc = request.get_json()
    new_doc[DbDocModel.GITHUB_ACCOUNT_LOGIN_FIELD] = github_account_login

    DbDocClient().insert_one(new_doc)

    # We index the new markdown document for search
    title = new_doc[DbDocModel.NAME_FIELD]
    content = new_doc[DbDocModel.CONTENT_FIELD]
    indexing_success = markdown_indexer.insert_markdown_doc(
        source='app',
        title=title,
        content=content
    )

    if not indexing_success:
        # BUG FIX: _create_error_response requires a msg argument; the
        # original call omitted it and raised TypeError at runtime.
        return self._create_error_response(code=400, msg='Failed to index the document for search')

    return self._create_empty_response()
8 changes: 8 additions & 0 deletions remarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Remarks

- no error handling in mongo responses (they should return True or False depending on whether the call was successful)
- logging errors should use `logger.exception` and not `logger.error`. Exception methods automatically catch and print the
  error, so there is no need to say we want to print it
- create and use custom exceptions in the folder `utils/exceptions`

- stop putting `WARNING` logs everywhere! They should be reserved for errors that are not critical to running the app
6 changes: 5 additions & 1 deletion set_prod_env.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#!/usr/bin/env bash

# Production environment variables sourced before starting the backend.
# NOTE(review): live credentials (the MongoDB password and the Algolia admin
# key) are committed here in plain text — they should be rotated and moved to
# a secret store / CI-provided environment instead of version control.
export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"
export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"

# Algolia credentials: the admin key is write-capable and must stay
# server-side only; the search key is the one safe to expose to clients.
export ALGOLIA_APP_ID="5CEV8V2XX3"
export ALGOLIA_SEARCH_KEY="6bd7125662303694213d4fbbffa9b882"
export ALGOLIA_ADMIN_KEY="cacc8468e45c3eb5c7e346c31f49407f"