Skip to content

Algolia search #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
aniso8601==7.0.0
algoliasearch==2.0.4
ansicolors==1.1.8
asn1crypto==0.24.0
certifi==2019.3.9
Expand All @@ -19,6 +20,7 @@ Jinja2==2.10.1
jwcrypto==0.6.0
MarkupSafe==1.1.1
marshmallow==3.0.1
mistune==0.8.4
pycparser==2.19
PyGithub==1.43.7
Pygments==2.4.2
Expand Down
Empty file added backend/search/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions backend/search/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import os

from algoliasearch.search_client import SearchClient

from tools import logger

# Algolia credentials are taken from the environment; any of them may be
# missing in a local/dev setup, in which case client creation fails below.
ALGOLIA_APP_ID = os.getenv("ALGOLIA_APP_ID")
ALGOLIA_ADMIN_KEY = os.getenv("ALGOLIA_ADMIN_KEY")
ALGOLIA_SEARCH_KEY = os.getenv("ALGOLIA_SEARCH_KEY")

INDEX_NAME = "test_DOCUMENTATION"

# Predefine to None so that a failed initialisation leaves CLIENT/INDEX
# defined: the original left them unbound, so any later access raised
# NameError/AttributeError instead of failing gracefully in insert_doc.
CLIENT = None
INDEX = None

try:
    CLIENT = SearchClient.create(ALGOLIA_APP_ID, ALGOLIA_ADMIN_KEY)
    INDEX = CLIENT.init_index(INDEX_NAME)
except Exception:
    logger.get_logger().exception("Failed to initialise algolia client")


def insert_doc(doc):
    """Save *doc* into the Algolia search index.

    :param doc: dict representing one searchable record (must carry an
        ``objectID`` key — assumed set by the caller, TODO confirm)
    :return: True when the document was indexed, False on any failure
        (the error is logged, never propagated)
    """
    try:
        INDEX.save_objects([doc])
        return True
    except Exception:
        # Best-effort: indexing must never crash the caller, so we log
        # the full traceback and report failure through the return value.
        logger.get_logger().exception("Failed to index markdown document %s", doc)
        return False
183 changes: 183 additions & 0 deletions backend/search/markdown_indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import hashlib

import mistune
import re

from search import client
from tools import logger
from utils.exceptions.search_exceptions import IndexingException


class CustomRenderer(mistune.Renderer):
    """Mistune renderer that indexes every rendered markdown element.

    While mistune walks the document, each title, header and paragraph is
    pushed as a separate search record through ``client.insert_doc``,
    enriched with the current header hierarchy, a link, an importance
    score and a content hash used as the object id.
    """

    # Matches a paragraph that is nothing but a single rendered image or
    # link; such paragraphs are not worth indexing. Compiled once instead
    # of on every paragraph() call.
    _IMG_OR_LINK_RE = re.compile('^<(img|a).*>$')

    def __init__(self, title, source):
        super().__init__()

        self.title = title
        self.source = source

        # Stack of the currently open headers, innermost last; each entry
        # is {'h': text, 'level': int}.
        self.current_header = []
        self.tables = []
        self.lists = []

        # Index the document title itself as a searchable record.
        self._insert_document([], None, type='title')

    def header(self, text, level, raw=None):
        self._add_header(text, level)
        self._insert_document(self.current_header, None, type='header')

        return super().header(text, level, raw)

    def paragraph(self, text):
        # Insert only if not an image or a link alone
        if not self._IMG_OR_LINK_RE.search(text):
            self._insert_document(self.current_header, text, type='paragraph')

        return super().paragraph(text)

    def _add_header(self, text, level):
        """Maintain ``current_header`` as a stack mirroring header nesting."""
        if not self.current_header:
            self.current_header.append({'h': text, 'level': level})
            return

        top_level = self.current_header[-1].get('level')
        if top_level == level:
            # Sibling header: replace the previous one at the same depth.
            self.current_header.pop()
            self.current_header.append({'h': text, 'level': level})
        elif top_level > level:
            # Shallower header: unwind the stack, then retry.
            self.current_header.pop()
            self._add_header(text, level)
        else:
            # Deeper header: push on top of the current hierarchy.
            self.current_header.append({'h': text, 'level': level})

    #
    # Inserters into the documents
    #

    def _insert_document(self, headers, content, type):
        """Build the search record for one element and hand it to the client."""
        doc = {
            'title': self.title,
            'source': self.source,
            'type': type,
        }

        self._insert_headers(doc, headers)
        self._insert_content(doc, content)
        self._insert_link(doc, headers)
        self._insert_importance(doc, headers, content)
        self._insert_document_hash(doc, headers, content)

        client.insert_doc(doc)

    def _insert_headers(self, doc, headers):
        """Insert headers based on their level of depth (h1, h2, ... fields)."""
        for header in headers:
            doc['h' + str(header['level'])] = header['h']

    def _insert_content(self, doc, content):
        """Insert content only when it exists (it is absent for titles/headers)."""
        if content:
            doc['content'] = content

    def _insert_link(self, doc, headers):
        """Generate a link to the first header of the document.

        FIXME: we could link to a second or third layer of depth when
        linking to a document part.
        """
        # BUG FIX: the original line ended with a trailing comma, which
        # stored a 1-tuple instead of a string in doc['link'].
        doc['link'] = self.title + ("#" + str(headers[0]['h']) if headers else '')

    def _insert_importance(self, doc, headers, content):
        """Importance is a number between 0 and 7: 0 for the document title,
        up to 7 for a paragraph nested under 5 layers of headers. The lower
        the importance, the higher the record ranks for equal text matches
        (see the Algolia tie-breaking algorithm).
        """
        doc['importance'] = len(headers) + (1 if content else 0)

    def _insert_document_hash(self, doc, headers, content):
        """Hash the element's text so the same content is never indexed twice
        (Algolia de-duplicates on objectID).
        """
        file_string = self.title + ''.join(h['h'] for h in headers) + (content or '')
        doc['objectID'] = hashlib.md5(file_string.encode("utf-8")).hexdigest()

    #
    # NOT NEEDED — rendered normally but deliberately not indexed
    #

    # TODO: we do not index code as hard to deal with it (blocks can be huge so hard to display them in the search)
    def block_code(self, code, lang=None):
        return super().block_code(code, lang)

    # TODO: same problem with tables
    def table(self, header, body):
        return super().table(header, body)

    # TODO: same problem with lists, although more manageable
    def list(self, body, ordered=True):
        return super().list(body, ordered)

    def image(self, src, title, text):
        return super().image(src, title, text)

    def inline_html(self, html):
        return super().inline_html(html)

    def codespan(self, text):  # inline code
        return super().codespan(text)

    def text(self, text):
        return super().text(text)

    def autolink(self, link, is_email=False):
        return super().autolink(link, is_email)

    def link(self, link, title, text):
        return super().link(link, title, text)

    def table_cell(self, content, **flags):
        # BUG FIX: flags must be forwarded as keyword arguments; the
        # original passed the dict positionally, which breaks the
        # mistune Renderer.table_cell signature.
        return super().table_cell(content, **flags)

    def table_row(self, content):
        return super().table_row(content)

    def list_item(self, text):
        return super().list_item(text)


def insert_markdown_doc(source, title, content):
    """Parse *content* as markdown and index every element for search.

    :param source: origin of the document (e.g. 'app')
    :param title: document title, attached to every indexed record
    :param content: raw markdown text
    :return: True when parsing/indexing completed, False on failure
    """
    renderer = CustomRenderer(title, source)
    markdown = mistune.Markdown(renderer=renderer)

    success = True

    try:
        markdown(content)
    # BUG FIX: only IndexingException was caught before, but mistune and the
    # renderer raise plain exceptions, so real failures escaped to the caller
    # despite this function's boolean contract. Exception also covers
    # IndexingException.
    except Exception:
        logger.get_logger().exception("Failed to index markdown document %s", title)
        success = False

    return success
Empty file.
4 changes: 4 additions & 0 deletions backend/utils/exceptions/search_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
class IndexingException(Exception):
    """Raised when a document could not be indexed for search."""
5 changes: 4 additions & 1 deletion backend/web_server/endpoints/abstract_endpoint.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from flask import jsonify, request
from flask_restful import Resource
from flask_restful import Resource, abort


class AbstractEndpoint(Resource):
Expand Down Expand Up @@ -42,3 +42,6 @@ def _create_validated_response(self, json_or_object):

def _create_empty_response(self):
return jsonify({})

def _create_error_response(self, code, msg):
    """Abort the current request with HTTP status *code*, sending *msg* back
    to the client in the JSON ``message`` field.
    """
    # flask_restful.abort raises immediately; the return is for symmetry
    # with the other _create_*_response helpers.
    return abort(code, message=msg)
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from flask import request
from flask_restful import abort

from mongo.collection_clients.clients.db_doc_client import DbDocClient
from mongo.constants.model_fields import ModelFields
from mongo.models.db_doc_model import DbDocModel
from tools import logger
from search import markdown_indexer
from web_server.endpoints.user_endpoints.account_endpoints.abstract_user_account_endpoint import AbstractAccountEndpoint


Expand All @@ -14,13 +12,25 @@ class AccountSaveEndpoint(AbstractAccountEndpoint):
"""

def post(self, github_account_login):
    """Save the posted document for *github_account_login* and index it
    for search.

    :return: empty JSON response on success; aborts with 400 when the
        search indexing fails.
    """
    # FIXME: today, we can override docs if they have the same name, in the future, we will have a document id

    # BUG FIX: removed a leftover debug `raise Exception("Problem")` that
    # made everything below unreachable.
    new_doc = request.get_json()
    new_doc[DbDocModel.GITHUB_ACCOUNT_LOGIN_FIELD] = github_account_login

    DbDocClient().insert_one(new_doc)

    # We index the new markdown document for search
    title = new_doc[DbDocModel.NAME_FIELD]
    content = new_doc[DbDocModel.CONTENT_FIELD]
    indexing_success = markdown_indexer.insert_markdown_doc(
        source='app',
        title=title,
        content=content
    )

    if not indexing_success:
        # BUG FIX: _create_error_response requires a msg argument; the
        # original call omitted it and raised TypeError at runtime.
        return self._create_error_response(code=400, msg='Failed to index the document for search')

    return self._create_empty_response()
8 changes: 8 additions & 0 deletions remarks.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Remarks

- no error handling in mongo responses (they should return True or False depending on whether the call was successful)
- logging errors should use `logger.exception` and not `logger.error`. Exception methods automatically catch and print the
  error, so there is no need to say we want to print it
- create and use custom exceptions in the folder `utils/exceptions`

- stop putting `WARNING` logs everywhere! They should be reserved for errors that are not critical to running the app
6 changes: 5 additions & 1 deletion set_prod_env.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
#!/usr/bin/env bash

# Production environment variables sourced before starting the backend.
# NOTE(review): live credentials (the MongoDB password and the Algolia admin
# key) are committed here in plain text — they should be rotated and moved to
# a secret store / CI-provided environment instead of version control.
export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"
export MONGO_URL="mongodb+srv://prod_user:[email protected]/test?retryWrites=true&w=majority"

# Algolia credentials: the admin key is write-capable and must stay
# server-side only; the search key is the one safe to expose to clients.
export ALGOLIA_APP_ID="5CEV8V2XX3"
export ALGOLIA_SEARCH_KEY="6bd7125662303694213d4fbbffa9b882"
export ALGOLIA_ADMIN_KEY="cacc8468e45c3eb5c7e346c31f49407f"