feat: Allow html python notebook (#223)

orronai · OdeYec · web-flow · commit 8b1914e9256d · 2020-10-05T10:56:20.000+03:00
* Added support for an HTML type header in notebook cells

- Fixed the type getter function
- Fixed the unittest for notebooks extractor
- Moved the ALLOWED_EXTENSIONS constant to utils.files, where it is now
  a set of the keys of the LANGUAGE_EXTENSIONS_TO_NAMES constant

Co-authored-by: Odelia Yechiel <odechann@gmail.com>
diff --git a/lms/extractors/base.py b/lms/extractors/base.py
@@ -54,18 +54,18 @@ def _split_header(cls, code: CodeFile) -> Tuple[str, str]:
         first_line = clean_text[:first_line_end].strip().replace('_', ' ')
         code_lines = clean_text[first_line_end:].strip()
 
-        log.debug(f'Upload title: {first_line}')
         return first_line, code_lines
 
     @classmethod
-    def _clean(cls, code: Union[Sequence, str]) -> Tuple[int, str]:
+    def _clean(cls, code: CodeFile) -> Tuple[int, str]:
         first_line, code_text = cls._split_header(code)
+        log.debug(f'Upload title: {first_line}.')
         upload_title = cls.UPLOAD_TITLE.fullmatch(first_line)
         if upload_title:
             exercise_id = int(upload_title.group(1))
             return exercise_id, code_text
 
-        log.debug(f'Unmatched title: {first_line}')
+        log.debug(f'Unmatched title: {first_line}.')
         return 0, ''
 
     def get_exercise(self, to_extract: Any) -> Tuple[int, List[File]]:
diff --git a/lms/extractors/notebook.py b/lms/extractors/notebook.py
@@ -1,8 +1,11 @@
 import itertools
 import json
+import re
 from typing import Any, Dict, Iterator, List, Tuple
 
 from lms.extractors.base import Extractor, File
+from lms.utils.files import ALLOWED_EXTENSIONS
+from lms.utils.log import log
 
 
 NotebookJson = Dict[str, Any]
@@ -13,6 +16,8 @@ class Notebook(Extractor):
     POSSIBLE_JSON_EXCEPTIONS = (
         json.JSONDecodeError, KeyError, StopIteration, UnicodeDecodeError,
     )
+    TYPE_LINE_PREFIX = re.compile(r'type:\s+(\w+)', re.IGNORECASE)
+    DEFAULT_FILE_TYPE = 'py'
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -41,10 +46,25 @@ def _get_code_cells(self) -> Iterator[Cell]:
         cells = notebook['cells']
         yield from filter(self._is_code_cell, cells)
 
+    def _get_file_type(self, code: str) -> Tuple[str, str]:
+        type_line, code_lines = self._split_header(code)
+        file_type_match = self.TYPE_LINE_PREFIX.fullmatch(type_line)
+
+        if file_type_match:
+            file_type = file_type_match.group(1)
+            if file_type not in ALLOWED_EXTENSIONS:
+                file_type = self.DEFAULT_FILE_TYPE
+            log.debug(f'File type: {file_type}.')
+            return code_lines, file_type
+
+        log.debug('No file type defined.')
+        return code, self.DEFAULT_FILE_TYPE
+
     def get_exercise(self, to_extract: Cell) -> Tuple[int, List[File]]:
         code: List[str] = to_extract.get('source', [])
         exercise_id, clean_code = self._clean(code)
-        return (exercise_id, [File('/main.py', clean_code)])
+        clean_code, ext = self._get_file_type(clean_code)
+        return (exercise_id, [File(f'/main.{ext}', clean_code)])
 
     def get_exercises(self) -> Iterator[Tuple[int, List[File]]]:
         """Yield exercise ID and code from notebook."""
diff --git a/lms/extractors/textfile.py b/lms/extractors/textfile.py
@@ -2,6 +2,7 @@
 
 from lms.extractors.base import Extractor, File
 from lms.models.errors import BadUploadFile
+from lms.utils.files import ALLOWED_EXTENSIONS
 
 
 TEXTCHARS = set(bytes(
@@ -11,14 +12,12 @@
 
 
 class Textfile(Extractor):
-    ALLOWED_EXTENSIONS = {'css', 'html', 'js', 'py', 'sql'}
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.filename_no_ext, _, self.ext = self.filename.rpartition('.')
 
     def can_extract(self) -> bool:
-        if self.ext not in self.ALLOWED_EXTENSIONS:
+        if self.ext not in ALLOWED_EXTENSIONS:
             return False
         if isinstance(self.file_content, str):
             return True
@@ -30,7 +29,7 @@ def get_exercise(self, to_extract: str) -> Tuple[int, List[File]]:
             exercise_id, _ = self._clean(self.filename_no_ext)
             content = to_extract
         if not exercise_id:
-            raise BadUploadFile("Can't resolve exercise id", self.filename)
+            raise BadUploadFile("Can't resolve exercise id.", self.filename)
 
         return (exercise_id, [File(f'/main.{self.ext}', content)])
 
diff --git a/lms/tests/samples/upload-1-2.ipynb b/lms/tests/samples/upload-1-2.ipynb
@@ -57,6 +57,47 @@
     "מקווה שלא יכשל על זה"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# upload 567\n",
+    "# type: 123\n",
+    "\n",
+    "וזה יבוא\n",
+    "אתה תראה"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# upload 122\n",
+    "# type: html\n",
+    "\n",
+    "<html>\n",
+    "<body>\n",
+    "<p>I wish I could fly</p>\n",
+    "</body>\n",
+    "</html>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# upload 23\n",
+    "# type: exe\n",
+    "\n",
+    "Fire in the hole"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/lms/tests/test_extractor.py b/lms/tests/test_extractor.py
@@ -73,9 +73,12 @@ def get_zip_filenames(self):
 
     def test_notebook(self):
         results = list(extractor.Extractor(self.ipynb_storage))
-        assert len(results) == 2
+        assert len(results) == 5
         assert results[0][0] == 3141
         assert results[1][0] == 2
+        assert results[2][1][0].path.endswith('.py')
+        assert results[3][1][0].path.endswith('.html')
+        assert results[4][1][0].path.endswith('.py')
         solution = extractor.Extractor(self.pyfiles_storage[1]).file_content
         solution = solution.replace('# Upload 3141', '')
         assert results[0][1][0].code == solution.strip()
diff --git a/lms/utils/files.py b/lms/utils/files.py
@@ -1,7 +1,9 @@
 LANGUAGE_EXTENSIONS_TO_NAMES = {
     'bat': 'batch',
+    'css': 'css',
     'h': 'c',
     'htm': 'html',
+    'html': 'html',
     'js': 'javascript',
     'md': 'markup',
     'ps1': 'powershell',
@@ -14,6 +16,8 @@
     'yml': 'yaml',
 }
 
+ALLOWED_EXTENSIONS = set(LANGUAGE_EXTENSIONS_TO_NAMES)
+
 
 def get_language_name_by_extension(ext: str) -> str:
     return LANGUAGE_EXTENSIONS_TO_NAMES.get(ext, ext)