From 824407f719e547195b83fcc696bdc110e9331265 Mon Sep 17 00:00:00 2001 From: katanasovski Date: Tue, 21 Dec 2021 14:41:22 +0100 Subject: [PATCH] lining --- .github/workflows/pre-commit.yml | 26 + .pre-commit-config.yaml | 46 +- CONTRIBUTING.md | 10 +- README.md | 8 +- docs/README.rst | 1 - docs/_templates/layout.html | 4 +- docs/chefops.md | 1 - docs/community/authors.rst | 2 +- docs/community/contributing.md | 2 +- docs/concepts/reviewing_channels.md | 1 - docs/concepts/terminology.md | 1 - docs/conf.py | 163 +-- docs/developer/corrections.md | 4 +- docs/developer/ids.md | 3 +- docs/developer/index.rst | 1 - docs/developer/kolibripreview.md | 4 +- docs/developer/uploadprocess.md | 14 +- docs/downloader.md | 2 - docs/examples/detokenify.pl | 1 - docs/examples/exercises.ipynb | 2 +- docs/examples/languages.ipynb | 2 +- docs/files.md | 1 - docs/history.rst | 1 - docs/htmlapps.md | 3 +- docs/parsing_html.md | 1 - docs/pdfutils.md | 9 +- docs/tutorial/jiro.md | 2 - docs/tutorial/quickstart.rst | 1 - docs/video_compression.md | 1 - examples/gettingstarted/sushichef.py | 4 +- examples/oldexamples/README.md | 1 - ...0c0f1a1a40226d8d227a07dd143f8c08a4b8a5.svg | 2 +- examples/oldexamples/content/captions.vtt | 1 - examples/oldexamples/large_wikipedia_chef.py | 58 +- examples/oldexamples/sample_program.py | 316 +++-- examples/oldexamples/wikipedia_video_chef.py | 72 +- examples/tutorial/sushichef.py | 75 +- examples/wikipedia/README.md | 1 - examples/wikipedia/sushichef.py | 60 +- resources/scripts/convertvideo.bat | 4 +- resources/scripts/convertvideo.sh | 2 +- resources/templates/csv_channel/Channel.csv | 2 +- resources/templates/csv_channel/Content.csv | 2 +- .../csv_channel/ExerciseQuestions.csv | 2 +- resources/templates/csv_channel/Exercises.csv | 2 +- resources/templates/csv_channel/csvchef.py | 5 +- ricecooker/__init__.py | 6 +- ricecooker/chefs.py | 513 +++++--- ricecooker/classes/files.py | 409 ++++-- ricecooker/classes/licenses.py | 176 ++- ricecooker/classes/nodes.py | 1126 +++++++++++------ ricecooker/classes/questions.py | 462 ++++--- ricecooker/cli.py | 160 +-- ricecooker/commands.py | 217 ++-- ricecooker/config.py | 254 ++-- ricecooker/exceptions.py | 63 +- ricecooker/managers/progress.py | 190 +-- ricecooker/managers/tree.py | 293 +++-- .../{sushichef.py => sushichef.py.template} | 0 ricecooker/utils/browser.py | 22 +- ricecooker/utils/caching.py | 28 +- ricecooker/utils/corrections.py | 482 ++++--- ricecooker/utils/downloader.py | 493 +++++--- ricecooker/utils/encodings.py | 40 +- ricecooker/utils/html.py | 102 +- ricecooker/utils/html_writer.py | 96 +- ricecooker/utils/images.py | 59 +- ricecooker/utils/jsontrees.py | 378 +++--- ricecooker/utils/kolibripreview.py | 23 +- ricecooker/utils/libstudio.py | 106 +- ricecooker/utils/linecook.py | 230 ++-- ricecooker/utils/metadata_provider.py | 554 ++++---- ricecooker/utils/paths.py | 4 +- ricecooker/utils/pdf.py | 101 +- ricecooker/utils/proxy.py | 71 +- ricecooker/utils/subtitles.py | 49 +- ricecooker/utils/thumbscropping.py | 35 +- ricecooker/utils/tokens.py | 20 +- ricecooker/utils/utils.py | 6 +- ricecooker/utils/videos.py | 107 +- ricecooker/utils/web.py | 34 +- ricecooker/utils/youtube.py | 244 ++-- ricecooker/utils/zip.py | 9 +- setup.py | 50 +- tests/conftest.py | 409 +++--- tests/media_utils/README.md | 13 +- .../files/assets/images/copyright.txt | 1 - tests/media_utils/files/page_with_links.html | 2 +- tests/media_utils/files/subtitles/basic.srt | 57 +- tests/media_utils/files/subtitles/basic.vtt | 56 +- 
tests/media_utils/files/subtitles/empty.ttml | 2 +- .../files/subtitles/encapsulated.sami | 2 +- tests/media_utils/files/subtitles/not.txt | 2 +- tests/media_utils/test_proxy.py | 44 +- tests/media_utils/test_subtitles.py | 72 +- tests/media_utils/test_thumbnails.py | 149 ++- tests/media_utils/test_videos.py | 137 +- tests/media_utils/test_web.py | 40 +- tests/media_utils/test_youtube.py | 149 ++- tests/test_argparse.py | 112 +- tests/test_csv_metadata.py | 44 +- tests/test_data.py | 189 ++- tests/test_downloader.py | 57 +- tests/test_exercises.py | 497 +++++--- tests/test_files.py | 415 ++++-- tests/test_licenses.py | 70 +- tests/test_links.py | 54 +- tests/test_pdfutils.py | 286 +++-- tests/test_requests.py | 35 +- tests/test_settings.py | 49 +- tests/test_thumbnails.py | 228 ++-- tests/test_tree.py | 345 ++--- tests/test_videos.py | 234 ++-- tests/test_youtube.py | 37 +- .../csv_channel_with_exercises/Channel.csv | 2 +- .../csv_channel_with_exercises/Content.csv | 2 +- .../ExerciseQuestions.csv | 2 +- .../csv_channel_with_exercises/Exercises.csv | 2 +- ...3f3bf7c317408ee90995b5bcf4f3a59606aedd.svg | 2 +- .../testcontent/samples/testsubtitles_ar.srt | 1 - 120 files changed, 7250 insertions(+), 4686 deletions(-) create mode 100644 .github/workflows/pre-commit.yml rename ricecooker/templates/{sushichef.py => sushichef.py.template} (100%) diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..5c8e21a0 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,26 @@ +name: Linting + +on: [push, pull_request] + +jobs: + pre_job: + name: Path match check + runs-on: ubuntu-latest + # Map a step output to a job output + outputs: + should_skip: ${{ steps.skip_check.outputs.should_skip }} + steps: + - id: skip_check + uses: fkirc/skip-duplicate-actions@master + with: + github_token: ${{ github.token }} + paths_ignore: '["**.po", "**.json"]' + linting: + name: All file linting + needs: pre_job + if: ${{ needs.pre_job.outputs.should_skip != 'true' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: pre-commit/action@v2.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 40144b97..5ecc9444 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,27 @@ -- repo: git://github.com/pre-commit/pre-commit-hooks - rev: v2.5.0 - hooks: - - id: trailing-whitespace - - id: flake8 - - id: check-yaml - - id: check-added-large-files - - id: debug-statements - - id: end-of-file-fixer - exclude: '^.+?\.json$' -- repo: git://github.com/asottile/reorder_python_imports - rev: v2.1.0 - hooks: - - id: reorder-python-imports -- repo: git://github.com/pre-commit/mirrors-eslint - sha: v3.14.0 - hooks: - - id: eslint - additional_dependencies: ['eslint', 'eslint-plugin-html', 'eslint-config-airbnb', 'eslint-plugin-import', 'eslint-plugin-jsx-a11y'] +repos: + - repo: git://github.com/pre-commit/pre-commit-hooks + rev: v2.5.0 + hooks: + - id: trailing-whitespace + exclude: '^.+?\.template$' + - id: flake8 + exclude: '^.+?\.template$' + - id: check-yaml + exclude: '^.+?\.template$' + - id: check-added-large-files + exclude: '^.+?\.template$' + - id: debug-statements + exclude: '^.+?\.template$' + - id: end-of-file-fixer + exclude: '^.+?(\.json|\.template)$' + + - repo: git://github.com/asottile/reorder_python_imports + rev: v2.1.0 + hooks: + - id: reorder-python-imports + exclude: '^.+?\.template$' + - repo: https://github.com/python/black + rev: 20.8b1 + 
hooks: + - id: black + exclude: '^.+?\.template$' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0f2b6bb0..e972d57e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -80,7 +80,7 @@ Here are the steps for setting up `ricecooker` for local development: 3. Create a Python virtual environment for this project (optional, but recommended): - * Install the `virtualenv` package using the command + * Install the `virtualenv` package using the command ``` pip install virtualenv ``` @@ -88,11 +88,11 @@ Here are the steps for setting up `ricecooker` for local development: * The next steps depends if you're using a UNIX system (Mac/Linux) or Windows: * For UNIX operating systems: * Create a virtual env called `venv` in the current directory using the - command: + command: ``` virtualenv -p python3 venv ``` - * Activate the virtualenv called `venv` by running: + * Activate the virtualenv called `venv` by running: ``` source venv/bin/activate ``` @@ -100,13 +100,13 @@ Here are the steps for setting up `ricecooker` for local development: * For Windows systems: * Create a virtual env called `venv` in the current directory using the - following command: + following command: ``` virtualenv -p C:/Python36/python.exe venv ``` You may need to adjust the `-p` argument depending on where your version of Python is located. Note you'll need Python version 3.5 or higher. - * Activate the virtualenv called `venv` by running: + * Activate the virtualenv called `venv` by running: ``` .\venv\Scripts\activate ``` diff --git a/README.md b/README.md index 7793bd22..ff3693ea 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ ricecooker ========== [![PyPI pyversions](https://img.shields.io/pypi/pyversions/ricecooker.svg)](https://pypi.python.org/pypi/ricecooker/) [![build](https://github.com/learningequality/ricecooker/actions/workflows/pythontest.yml/badge.svg?branch=master)](https://github.com/learningequality/ricecooker/actions) -[![docs](https://readthedocs.org/projects/ricecooker/badge/?version=latest&style=flat)](https://ricecooker.readthedocs.io/) +[![docs](https://readthedocs.org/projects/ricecooker/badge/?version=latest&style=flat)](https://ricecooker.readthedocs.io/) The `ricecooker` library is a framework for automating the conversion of educational content into -Kolibri content channels and uploading them to [Kolibri Studio](https://studio.learningequality.org/), +Kolibri content channels and uploading them to [Kolibri Studio](https://studio.learningequality.org/), which is the central content server for [Kolibri](http://learningequality.org/kolibri/). @@ -36,7 +36,7 @@ The basic process of getting new content into Kolibri is as follows: The diagram below illustrates how content flows within the Kolibri ecosystem and highlights the part which is covered by the `ricecooker` framework (bottom left). -![Overview of steps for integrating external content sources for use in the Kolibri Learning Platform](docs/figures/content_pipeline_diagram_with_highlight.png) +![Overview of steps for integrating external content sources for use in the Kolibri Learning Platform](docs/figures/content_pipeline_diagram_with_highlight.png) *External content sources (left) are first uploaded to [Kolibri Studio](https://studio.learningequality.org/) (middle), so they can be used in the [Kolibri Learning Platform](http://learningequality.org/kolibri/) (right).* @@ -48,7 +48,7 @@ place to learn about writing automated content integration scripts. 
Here are some links to other documents and guides you can read to learn about the other parts of the Kolibri content platform: - + - The [Kolibri Content Integration Guide](https://learningequality.org/r/integration-guide) is a comprehensive guide to the decisions, processes, and tools for integrating external content sources for use in the Kolibri Learning Platform. diff --git a/docs/README.rst b/docs/README.rst index 5b4e93bc..caa812e1 100644 --- a/docs/README.rst +++ b/docs/README.rst @@ -23,4 +23,3 @@ License :alt: Creative Commons License This work is licensed under a `Creative Commons Attribution-ShareAlike 4.0 International License `__ - diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html index 4e46b4a9..94deb598 100644 --- a/docs/_templates/layout.html +++ b/docs/_templates/layout.html @@ -17,7 +17,7 @@ /* div { border: 1px solid red; } */ - + .row {clear: both} @media (min-width: 800px) { @@ -38,7 +38,7 @@ float: left; } - + .column3 { width: 33.3%; font-size: 80%; diff --git a/docs/chefops.md b/docs/chefops.md index 5a72d53e..b25e15f8 100644 --- a/docs/chefops.md +++ b/docs/chefops.md @@ -141,4 +141,3 @@ time to complete so it is best to run them on a dedicated server for this purpos e.g. `./sushichef.py --token=... --thumbnails lang=fr`. - By default `nohup` logs stderr and stdout output to a file called `nohup.out` in the current working directory. Use `tail -f nohup.out` to follow this log file. - diff --git a/docs/community/authors.rst b/docs/community/authors.rst index 49689011..ab8224b5 120000 --- a/docs/community/authors.rst +++ b/docs/community/authors.rst @@ -1 +1 @@ -../../AUTHORS.rst \ No newline at end of file +../../AUTHORS.rst diff --git a/docs/community/contributing.md b/docs/community/contributing.md index f939e75f..bcac999a 120000 --- a/docs/community/contributing.md +++ b/docs/community/contributing.md @@ -1 +1 @@ -../../CONTRIBUTING.md \ No newline at end of file +../../CONTRIBUTING.md diff --git a/docs/concepts/reviewing_channels.md b/docs/concepts/reviewing_channels.md index 7c0df4d3..e826d7d0 100644 --- a/docs/concepts/reviewing_channels.md +++ b/docs/concepts/reviewing_channels.md @@ -56,4 +56,3 @@ You can. Whenever you need a distraction, take 20 minutes and place yourself in the learner's shoes and go explore the channel on the demo server link provided on the notion card. If you notice any issues while browsing, add them to the Issue tracker table. That's it. Learn something today. - diff --git a/docs/concepts/terminology.md b/docs/concepts/terminology.md index 24f94275..03cacbc7 100644 --- a/docs/concepts/terminology.md +++ b/docs/concepts/terminology.md @@ -61,4 +61,3 @@ process to Kolibri Studio for review and publishing. Conceptually, `SushiChef` scripts are very similar to web scrapers, but with specialized functions for optimizing the content for Kolibri's data structures and capabilities. - diff --git a/docs/conf.py b/docs/conf.py index 0d1f0694..a682477d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,7 +12,6 @@ # # All configuration values have a default; values that are commented out # serve to show the default. - # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
@@ -21,7 +20,8 @@ import os import sys from datetime import datetime -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) # from recommonmark.parser import CommonMarkParser from ricecooker import __version__ as current_ricecooker_version @@ -37,13 +37,13 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.todo', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", "recommonmark", "nbsphinx", "sphinx_rtd_theme", @@ -51,19 +51,19 @@ ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The encoding of source files. # # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'ricecooker' -copyright = u'{year:d}, Learning Equality'.format(year=datetime.now().year) -author = 'Learning Equality Content Team' +project = "ricecooker" +copyright = u"{year:d}, Learning Equality".format(year=datetime.now().year) +author = "Learning Equality Content Team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -94,10 +94,10 @@ # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path exclude_patterns = [ - 'examples/.ipynb_checkpoints', - 'examples/drafts', - '_build', - 'build', + "examples/.ipynb_checkpoints", + "examples/drafts", + "_build", + "build", ] # The reST default role (used for this markup: `text`) to use for all @@ -120,7 +120,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -137,7 +137,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -160,17 +160,17 @@ # The name of an image file (relative to this directory) to place at the top # of the sidebar. # -html_logo = 'figures/kolibri_logo.png' +html_logo = "figures/kolibri_logo.png" # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = 'figures/logo.ico' +html_favicon = "figures/logo.ico" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] # ['_static'] +html_static_path = [] # ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -181,7 +181,7 @@ # bottom, using the given strftime format. # The empty string is equivalent to '%b %d, %Y'. 
# -#html_last_updated_fmt = '' +# html_last_updated_fmt = '' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. @@ -249,7 +249,7 @@ # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'ricecookerdoc' +htmlhelp_basename = "ricecookerdoc" # -- Options for LaTeX output --------------------------------------------- @@ -257,18 +257,18 @@ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). - 'papersize': 'letterpaper', - 'fncychap': '\\usepackage{fncychap}', - 'fontpkg': '\\usepackage[default]{lato}\\usepackage[T1]{fontenc}', - 'figure_align':'htbp', + "papersize": "letterpaper", + "fncychap": "\\usepackage{fncychap}", + "fontpkg": "\\usepackage[default]{lato}\\usepackage[T1]{fontenc}", + "figure_align": "htbp", # The font size ('10pt', '11pt' or '12pt'). # - 'pointsize': '11pt', - 'extraclassoptions': 'oneside', + "pointsize": "11pt", + "extraclassoptions": "oneside", # The font size ('10pt', '11pt' or '12pt'). #'pointsize': '10pt', # Additional stuff for the LaTeX preamble. - 'preamble': r''' + "preamble": r""" %%% FRONTMATTER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % %%%add number to subsubsection 2=subsection, 3=subsubsection @@ -346,9 +346,8 @@ \newunicodechar{🍣}{sushi} \newunicodechar{🍱}{lunchbox} - ''', - - 'maketitle': r''' + """, + "maketitle": r""" %%% SET PDF INFO %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Note: this has to be after preamble where \title and \author are defined @@ -404,21 +403,20 @@ % \listoftables \clearpage \pagenumbering{arabic} - ''', + """, # Latex figure (float) alignment # 'figure_align': 'htbp', # # GLOBAL OPTIONS FOR THE sphynx.sty STYLE CLASS ############################ - 'sphinxsetup': \ - 'hmargin={1in,1in}, vmargin={1.2in,0.7in}, \ + "sphinxsetup": "hmargin={1in,1in}, vmargin={1.2in,0.7in}, \ TitleColor={rgb}{0,0,0}, \ InnerLinkColor={rgb}{0,0,1}, \ OuterLinkColor={rgb}{0,0,1}, \ verbatimwithframe=false, \ VerbatimColor={rgb}{0.95,0.95,0.95}, \ verbatimvisiblespace={}, \ - verbatimcontinued={}', - 'tableofcontents':' ', + verbatimcontinued={}", + "tableofcontents": " ", } # other sphinxsetup options: # verbatimwithframe=true, \ @@ -439,36 +437,16 @@ ############################################################################ - - - - - - - - - - - - - - - - - - - - - # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, # source start file, - 'ricecooker.tex', # target name, - 'ricecooker docs', # title, - 'Learning Equality Content Team', # author, - 'manual' # documentclass [howto, manual, or own class]) + ( + master_doc, # source start file, + "ricecooker.tex", # target name, + "ricecooker docs", # title, + "Learning Equality Content Team", # author, + "manual", # documentclass [howto, manual, or own class]) ), ] @@ -476,7 +454,7 @@ # The name of an image file (relative to this directory) to place at the top of # the title page. # -latex_logo = 'figures/content_pipeline_diagram_with_highlight.png' +latex_logo = "figures/content_pipeline_diagram_with_highlight.png" # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. @@ -510,12 +488,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
-man_pages = [ - (master_doc, - 'ricecooker', - 'ricecooker Documentation', - [author], 1) -] +man_pages = [(master_doc, "ricecooker", "ricecooker Documentation", [author], 1)] # If true, show URL addresses after external links. # @@ -528,13 +501,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, - 'ricecooker', - 'ricecooker Documentation', - author, - 'ricecooker', - 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "ricecooker", + "ricecooker Documentation", + author, + "ricecooker", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. @@ -547,7 +522,7 @@ # How to display URL addresses: 'footnote', 'no', or 'inline'. # -texinfo_show_urls = 'inline' +texinfo_show_urls = "inline" # If true, do not generate a @detailmenu in the "Top" node's menu. # @@ -608,7 +583,7 @@ # epub_post_files = [] # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # The depth of the table of contents in toc.ncx. # @@ -642,22 +617,22 @@ # Configuration for intersphinx for various LE projects intersphinx_mapping = { - 'python': ('https://docs.python.org/3.6/', None), - 'django': ('https://django.readthedocs.io/en/latest/', None), - 'kolibri-user': ('http://kolibri.readthedocs.io/en/latest/', None), - 'kolibri': ('http://kolibri-dev.readthedocs.io/en/latest/', None), - 'studio-user': ('https://kolibri-studio.readthedocs.io/en/latest/', None), + "python": ("https://docs.python.org/3.6/", None), + "django": ("https://django.readthedocs.io/en/latest/", None), + "kolibri-user": ("http://kolibri.readthedocs.io/en/latest/", None), + "kolibri": ("http://kolibri-dev.readthedocs.io/en/latest/", None), + "studio-user": ("https://kolibri-studio.readthedocs.io/en/latest/", None), } # Also accept .md files (via https://github.com/rtfd/recommonmark) -source_suffix = ['.md', '.rst', '.ipynb'] +source_suffix = [".md", ".rst", ".ipynb"] autodoc_default_options = { - # Make sure that any autodoc declarations show the right members - "members": None, - "inherited-members": None, - "undoc-members": None, - #"private-members": True, - "show-inheritance": None, + # Make sure that any autodoc declarations show the right members + "members": None, + "inherited-members": None, + "undoc-members": None, + # "private-members": True, + "show-inheritance": None, } diff --git a/docs/developer/corrections.md b/docs/developer/corrections.md index 0e68a164..21405da5 100644 --- a/docs/developer/corrections.md +++ b/docs/developer/corrections.md @@ -1,6 +1,6 @@ Studio bulk corrections ======================= -The command line script `corrections` allows to perform bulk corrections of +The command line script `corrections` allows to perform bulk corrections of titles, descriptions, and other attributes for the content nodes of a channel. @@ -83,7 +83,7 @@ document that contains the corrections (usually `=0`). The attributes that will be edited during the `modify` operation is specified using the `--modifyattrs` command line argument. 
For example to apply modifications -only to the `title` and `description` attributes use the following command: +only to the `title` and `description` attributes use the following command: corrections apply --gsheet_id='' --gid= --modifyattrs='title,description' diff --git a/docs/developer/ids.md b/docs/developer/ids.md index d96ca124..e437fe12 100644 --- a/docs/developer/ids.md +++ b/docs/developer/ids.md @@ -46,7 +46,7 @@ Content nodes within the Kolibri ecosystem have the following identifiers: and the content node's `source_id` used for tracking a user interactions with the content node (e.g. video watched, or exercise completed). -When a particular piece of content appears in multiple channels, or in different +When a particular piece of content appears in multiple channels, or in different places within a tree, the `node_id` of each occurrence will be different, but the `content_id` of each item will be the same for all copies. In other words, the `content_id` keeps track of the "is identical to" information about content nodes. @@ -115,4 +115,3 @@ has content overlap with items in another channel, you must look into how it com source_domain and source_id and use the same approach to get matching `content_id`s. This cheffing-time deduplication effort is worth investing in, because it makes possible all the applications described above. - diff --git a/docs/developer/index.rst b/docs/developer/index.rst index b1af2a2d..40a6126f 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -13,4 +13,3 @@ To learn about the inner workings of the ``ricecooker`` library, consult the fol uploadprocess design_cli corrections - diff --git a/docs/developer/kolibripreview.md b/docs/developer/kolibripreview.md index f4363338..681cfbb8 100644 --- a/docs/developer/kolibripreview.md +++ b/docs/developer/kolibripreview.md @@ -41,7 +41,7 @@ Prerequisites 4. Download the helper script `kolibripreview.py` and make it executable: ```bash wget https://raw.githubusercontent.com/learningequality/ricecooker/master/ricecooker/utils/kolibripreview.py - chmod +x kolibripreview.py + chmod +x kolibripreview.py ``` @@ -105,7 +105,7 @@ python kolibri-0.13.2.pex manage importcontent network 0413dd5173014d33b5a98a8c0 python kolibri-0.13.2.pex start --foreground ``` -After that you can use the script as usual: +After that you can use the script as usual: 1. Replace placeholder .zip with contents of `webroot`: ```bash diff --git a/docs/developer/uploadprocess.md b/docs/developer/uploadprocess.md index c824f436..ce852131 100644 --- a/docs/developer/uploadprocess.md +++ b/docs/developer/uploadprocess.md @@ -84,7 +84,7 @@ functions that currently support PDF, ePub, HTML5, mp3 files, and videos. File diff --------- Ricecooker then sends the list of filenames (using the content-hash based names) -to Studio to check which files are already present. +to Studio to check which files are already present. ```python get_file_diff(tree, files_to_diff) @@ -136,7 +136,7 @@ like nodes modified/added/removed and the total storage space requirements. Deploying the channel (optional) --------------------------------- +-------------------------------- Studio channel editors can use the `DEPLOY` button in the Studio web interface to activate the "draft copy" and make it visible to all Studio users. This is implemented by replacing the channel's `main` tree with the `staging` tree. 
@@ -146,16 +146,10 @@ During [this step](https://github.com/learningequality/studio/blob/5564c1fc540d8 Publish channel (optional) -------------------------- The `PUBLISH` channel button on Studio is used to save and export a new version of the channel. -The PUBLISH action exports all the channel metadata to a sqlite3 DB file served +The PUBLISH action exports all the channel metadata to a sqlite3 DB file served by Studio at the URL `/content/{{channel_id}}.sqlite3` and ensure the associated -files exist in `/content/storage/` which is served by a CDN. +files exist in `/content/storage/` which is served by a CDN. This step is a prerequisite for getting the channel out of Studio and into Kolibri. The combination of `{{channel_id}}.sqlite3` file and the files in `/content/storage` define the Kolibri Channels content format. This is what gets exported to the folder `KOLIBRI_DATA` on sdcard or external drives when you use the `EXPORT` action in Kolibri. - - - - - - diff --git a/docs/downloader.md b/docs/downloader.md index 04d3584d..1902782f 100644 --- a/docs/downloader.md +++ b/docs/downloader.md @@ -124,5 +124,3 @@ Further reading --------------- - Tutorial on the Python [requests module](https://stackabuse.com/the-python-requests-module/). - - diff --git a/docs/examples/detokenify.pl b/docs/examples/detokenify.pl index 3ccb16fc..37d95f33 100755 --- a/docs/examples/detokenify.pl +++ b/docs/examples/detokenify.pl @@ -10,4 +10,3 @@ s/a5c5fb[\da-f]{34}/YOURTOKENHERE9139139f3a23232/g; s/70aec3[\da-f]{34}/YOURTOKENHERE9139139f3a23232/g; s/563554[\da-f]{34}/YOURTOKENHERE9139139f3a23232/g; - diff --git a/docs/examples/exercises.ipynb b/docs/examples/exercises.ipynb index cbb9330e..1c064878 100644 --- a/docs/examples/exercises.ipynb +++ b/docs/examples/exercises.ipynb @@ -232,4 +232,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/docs/examples/languages.ipynb b/docs/examples/languages.ipynb index c231667e..87421145 100644 --- a/docs/examples/languages.ipynb +++ b/docs/examples/languages.ipynb @@ -540,4 +540,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/docs/files.md b/docs/files.md index 963f29cb..39286891 100644 --- a/docs/files.md +++ b/docs/files.md @@ -248,4 +248,3 @@ recommend that long PDF documents be split into separate parts. Note: Kolibri Studio imposes a file storage quota on a per-user basis. By default the storage limit for new accounts is 500MB. Please get in touch with the content team by email (`content@le...`) if you need a quota increase. - diff --git a/docs/history.rst b/docs/history.rst index b12bb0c6..f06b508a 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -190,4 +190,3 @@ History 0.1.0 (2016-09-30) ------------------ * First release on PyPI. - diff --git a/docs/htmlapps.md b/docs/htmlapps.md index 0c5bb8a3..cbaff9b9 100644 --- a/docs/htmlapps.md +++ b/docs/htmlapps.md @@ -240,7 +240,7 @@ with HTMLWriter('./myzipfile.zip') as zipper: # Add your code here ``` -To write the main file (`index.html` in the root of the zip file), use the +To write the main file (`index.html` in the root of the zip file), use the `write_index_contents` method: ```python contents = "Hello, World!" @@ -283,4 +283,3 @@ Further reading - The new H5P content format (experimental support) is also conceptually similar but contains much more structure and metadata about the javascript libraries that are used. See `H5PAppNode` and the `H5PFile` for more info. 
- diff --git a/docs/parsing_html.md b/docs/parsing_html.md index 380dc834..538704a7 100644 --- a/docs/parsing_html.md +++ b/docs/parsing_html.md @@ -91,4 +91,3 @@ There are also some excellent tutorials online you can read: - [http://akul.me/blog/2016/beautifulsoup-cheatsheet/](http://akul.me/blog/2016/beautifulsoup-cheatsheet/) - [http://youkilljohnny.blogspot.com/2014/03/beautifulsoup-cheat-sheet-parse-html-by.html](http://youkilljohnny.blogspot.com/2014/03/beautifulsoup-cheat-sheet-parse-html-by.html) - [http://www.compjour.org/warmups/govt-text-releases/intro-to-bs4-lxml-parsing-wh-press-briefings/](http://www.compjour.org/warmups/govt-text-releases/intro-to-bs4-lxml-parsing-wh-press-briefings/) - diff --git a/docs/pdfutils.md b/docs/pdfutils.md index dd833181..5ab73da3 100644 --- a/docs/pdfutils.md +++ b/docs/pdfutils.md @@ -22,17 +22,17 @@ Here is how to split a PDF document located at `pdf_path`, which can be either a local path or a URL: from ricecooker.utils.pdf import PDFParser - + pdf_path = '/some/local/doc.pdf' or 'https://somesite.org/some/remote/doc.pdf' with PDFParser(pdf_path) as pdfparser: chapters = pdfparser.split_chapters() The output `chapters` is list of dictionaries with `title` and `path` attributes: - [ + [ {'title':'First chapter', 'path':'downloads/doc/First-chapter.pdf'}, {'title':'Second chapter', 'path':'downloads/doc/Second-chapter.pdf'}, - ... + ... ] Use this information to create an individual `DocumentNode` for each PDF and store @@ -67,7 +67,7 @@ has the following format: If the page ranges automatically detected form the PDF's table of contents are not suitable for the document you're processing, or if the PDF document does not -contain table of contents information, you can manually create the title and +contain table of contents information, you can manually create the title and page range data and pass it as the `jsondata` argument to the `split_chapters()`. page_ranges = pdfparser.get_toc() @@ -140,4 +140,3 @@ Accessibility notes ------------------- Do not use `PDFParser` for tagged PDFs because splitting and processing loses the accessibility features of the original PDF document. - diff --git a/docs/tutorial/jiro.md b/docs/tutorial/jiro.md index 3fd36c6a..e3e7b50f 100644 --- a/docs/tutorial/jiro.md +++ b/docs/tutorial/jiro.md @@ -54,5 +54,3 @@ with that name. Otherwise, it will upload to production Studio. If you have never registered an API token for the Studio server you're uploading to, it may prompt you to enter it when running this command. - - diff --git a/docs/tutorial/quickstart.rst b/docs/tutorial/quickstart.rst index fde45117..17db7613 100644 --- a/docs/tutorial/quickstart.rst +++ b/docs/tutorial/quickstart.rst @@ -15,4 +15,3 @@ The following links will get you started with content integration process in no - For more info see the `ricecooker docs main page <../index.html>`_ 📚. Welcome to the team! 
- diff --git a/docs/video_compression.md b/docs/video_compression.md index a20af491..6c0c6b75 100644 --- a/docs/video_compression.md +++ b/docs/video_compression.md @@ -265,4 +265,3 @@ Here are the steps to preview different compression factors in Kolibri: - PUBLISH the channel and record the channel token - Import the channel into a Kolibri instance using the channel token - Test video playback on different devices (desktop and mobile browsers on all OSs) - diff --git a/examples/gettingstarted/sushichef.py b/examples/gettingstarted/sushichef.py index 09e70a7e..3fe1f419 100755 --- a/examples/gettingstarted/sushichef.py +++ b/examples/gettingstarted/sushichef.py @@ -1,8 +1,10 @@ #!/usr/bin/env python from ricecooker.chefs import SushiChef -from ricecooker.classes.nodes import ChannelNode, TopicNode, DocumentNode from ricecooker.classes.files import DocumentFile from ricecooker.classes.licenses import get_license +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import DocumentNode +from ricecooker.classes.nodes import TopicNode class SimpleChef(SushiChef): diff --git a/examples/oldexamples/README.md b/examples/oldexamples/README.md index 1b952a40..c650ffd3 100644 --- a/examples/oldexamples/README.md +++ b/examples/oldexamples/README.md @@ -20,4 +20,3 @@ WARNING Exercise 6cafe3: http://www.publicdomainpictures.net/pictures/110000/n WARNING Question ddddd: ka-perseus-graphie.s3.amazonaws.com/907dec1b45fb177f0937fa521b7af03fb837f0bd [Errno 2] No such file or directory: 'ka-perseus-graphie.s3.amazonaws.com/907dec1b45fb177f0937fa521b7af03fb837f0bd.svg' ``` - diff --git a/examples/oldexamples/content/0a0c0f1a1a40226d8d227a07dd143f8c08a4b8a5.svg b/examples/oldexamples/content/0a0c0f1a1a40226d8d227a07dd143f8c08a4b8a5.svg index f6373f9e..c3e5e1c0 100644 --- a/examples/oldexamples/content/0a0c0f1a1a40226d8d227a07dd143f8c08a4b8a5.svg +++ b/examples/oldexamples/content/0a0c0f1a1a40226d8d227a07dd143f8c08a4b8a5.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/examples/oldexamples/content/captions.vtt b/examples/oldexamples/content/captions.vtt index fd319c88..ae7b0408 100644 --- a/examples/oldexamples/content/captions.vtt +++ b/examples/oldexamples/content/captions.vtt @@ -31,4 +31,3 @@ bila ambayo moja wanders bure kwa njia ya labyrinth giza. 
" 00:00:35.933 --> 00:00:41.106 Sana lakini kwa kiasi kikubwa na kina kirefu sana na kwa kweli hii ni hatua ya hisabati - diff --git a/examples/oldexamples/large_wikipedia_chef.py b/examples/oldexamples/large_wikipedia_chef.py index d74cd5f5..2d45dae7 100755 --- a/examples/oldexamples/large_wikipedia_chef.py +++ b/examples/oldexamples/large_wikipedia_chef.py @@ -1,5 +1,4 @@ #!/usr/bin/env python - import tempfile import requests @@ -8,8 +7,12 @@ from ricecooker.chefs import SushiChef from ricecooker.classes import licenses from ricecooker.classes.files import HTMLZipFile -from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode -from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.utils.caching import CacheControlAdapter +from ricecooker.utils.caching import CacheForeverHeuristic +from ricecooker.utils.caching import FileCache from ricecooker.utils.html import download_file from ricecooker.utils.zip import create_predictable_zip @@ -17,15 +20,15 @@ SOURCE_DOMAIN = "" # SOURCE_ID = "" # an alphanumeric ID refering to this channel CHANNEL_TITLE = "" # a humand-readbale title -CHANNEL_LANGUAGE = "en" # language of channel +CHANNEL_LANGUAGE = "en" # language of channel sess = requests.Session() -cache = FileCache('.webcache') +cache = FileCache(".webcache") basic_adapter = CacheControlAdapter(cache=cache) forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) -sess.mount('http://', forever_adapter) -sess.mount('https://', forever_adapter) +sess.mount("http://", forever_adapter) +sess.mount("https://", forever_adapter) def make_fully_qualified_url(url): @@ -51,29 +54,33 @@ def get_parsed_html_from_url(url, *args, **kwargs): return BeautifulSoup(html, "html.parser") - class LargeWikipediaChef(SushiChef): """ The chef class that takes care of uploading channel to the content curation server. We'll call its `main()` method from the command line script. """ - channel_info = { # - 'CHANNEL_SOURCE_DOMAIN': SOURCE_DOMAIN, # who is providing the content (e.g. learningequality.org) - 'CHANNEL_SOURCE_ID': SOURCE_ID, # channel's unique id - 'CHANNEL_TITLE': CHANNEL_TITLE, - 'CHANNEL_LANGUAGE': CHANNEL_LANGUAGE, - 'CHANNEL_THUMBNAIL': 'https://lh3.googleusercontent.com/zwwddqxgFlP14DlucvBV52RUMA-cV3vRvmjf-iWqxuVhYVmB-l8XN9NDirb0687DSw=w300', # (optional) local path or url to image file - 'CHANNEL_DESCRIPTION': 'A large channel created from Wikipedia content.', # (optional) description of the channel (optional) + + channel_info = { # + "CHANNEL_SOURCE_DOMAIN": SOURCE_DOMAIN, # who is providing the content (e.g. learningequality.org) + "CHANNEL_SOURCE_ID": SOURCE_ID, # channel's unique id + "CHANNEL_TITLE": CHANNEL_TITLE, + "CHANNEL_LANGUAGE": CHANNEL_LANGUAGE, + "CHANNEL_THUMBNAIL": "https://lh3.googleusercontent.com/zwwddqxgFlP14DlucvBV52RUMA-cV3vRvmjf-iWqxuVhYVmB-l8XN9NDirb0687DSw=w300", # (optional) local path or url to image file + "CHANNEL_DESCRIPTION": "A large channel created from Wikipedia content.", # (optional) description of the channel (optional) } def construct_channel(self, *args, **kwargs): """ Create ChannelNode and build topic tree. 
""" - channel = self.get_channel(*args, **kwargs) # creates ChannelNode from data in self.channel_info + channel = self.get_channel( + *args, **kwargs + ) # creates ChannelNode from data in self.channel_info city_topic = TopicNode(source_id="List_of_largest_cities", title="Cities!") channel.add_child(city_topic) - add_subpages_from_wikipedia_list(city_topic, "https://en.wikipedia.org/wiki/List_of_largest_cities") + add_subpages_from_wikipedia_list( + city_topic, "https://en.wikipedia.org/wiki/List_of_largest_cities" + ) return channel @@ -87,7 +94,11 @@ def add_subpages_from_wikipedia_list(topic, list_url): page = get_parsed_html_from_url(list_url) # extract the main table from the page - table = page.find(lambda tag: tag.name == 'table' and tag.has_attr('class') and 'wikitable' in tag['class']) + table = page.find( + lambda tag: tag.name == "table" + and tag.has_attr("class") + and "wikitable" in tag["class"] + ) # loop through all the rows in the table for row in table.find_all("tr"): @@ -117,7 +128,9 @@ def add_subpages_from_wikipedia_list(topic, list_url): # attempt to extract a thumbnail for the subpage, from the second column in the table image = columns[1].find("img") thumbnail_url = make_fully_qualified_url(image["src"]) if image else None - if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")): + if thumbnail_url and not ( + thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png") + ): thumbnail_url = None # download the wikipedia page into an HTML5 app node @@ -159,14 +172,15 @@ def process_wikipedia_page(content, baseurl, destpath, **kwargs): page = BeautifulSoup(content, "html.parser") for image in page.find_all("img"): - relpath, _ = download_file(make_fully_qualified_url(image["src"]), destpath, request_fn=make_request) + relpath, _ = download_file( + make_fully_qualified_url(image["src"]), destpath, request_fn=make_request + ) image["src"] = relpath return str(page) - -if __name__ == '__main__': +if __name__ == "__main__": """ This code will run when the sushi chef is called from the command line. 
""" diff --git a/examples/oldexamples/sample_program.py b/examples/oldexamples/sample_program.py index fe78f384..c71bb6d7 100755 --- a/examples/oldexamples/sample_program.py +++ b/examples/oldexamples/sample_program.py @@ -1,30 +1,41 @@ #!/usr/bin/env python - -from enum import Enum import json import os -from os.path import join import re +from enum import Enum +from os.path import join + +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import file_formats +from le_utils.constants import format_presets +from le_utils.constants import languages +from le_utils.constants import licenses from ricecooker.chefs import SushiChef -from ricecooker.classes import nodes, questions, files +from ricecooker.classes import files +from ricecooker.classes import nodes +from ricecooker.classes import questions from ricecooker.classes.licenses import get_license -from ricecooker.exceptions import UnknownContentKindError, UnknownFileTypeError, UnknownQuestionTypeError, InvalidFormatException, raise_for_invalid_channel -from le_utils.constants import content_kinds,file_formats, format_presets, licenses, exercises, languages +from ricecooker.exceptions import InvalidFormatException +from ricecooker.exceptions import raise_for_invalid_channel +from ricecooker.exceptions import UnknownContentKindError +from ricecooker.exceptions import UnknownFileTypeError +from ricecooker.exceptions import UnknownQuestionTypeError from ricecooker.utils.encodings import get_base64_encoding # CHANNEL SETTINGS -SOURCE_DOMAIN = "" # content provider's domain -SOURCE_ID = "" # an alphanumeric channel ID -CHANNEL_TITLE = "Testing Ricecooker Channel" # a humand-readbale title -CHANNEL_LANGUAGE = "en" # language code of channel +SOURCE_DOMAIN = "" # content provider's domain +SOURCE_ID = "" # an alphanumeric channel ID +CHANNEL_TITLE = "Testing Ricecooker Channel" # a humand-readbale title +CHANNEL_LANGUAGE = "en" # language code of channel # LOCAL DIRS EXAMPLES_DIR = os.path.dirname(os.path.realpath(__file__)) -DATA_DIR = os.path.join(EXAMPLES_DIR, 'data') -CONTENT_DIR = os.path.join(EXAMPLES_DIR, 'content') +DATA_DIR = os.path.join(EXAMPLES_DIR, "data") +CONTENT_DIR = os.path.join(EXAMPLES_DIR, "content") # # A utility function to manage absolute paths that allows us to refer to files # in the CONTENT_DIR (subdirectory `content/' in current directory) using content:// @@ -34,21 +45,21 @@ def get_abspath(path, content_dir=CONTENT_DIR): By default looks for content in subdirectory `content` in current directory. 
""" if path: - file = re.search('content://(.+)', path) + file = re.search("content://(.+)", path) if file: return os.path.join(content_dir, file.group(1)) return path - class FileTypes(Enum): - """ Enum containing all file types Ricecooker can have + """Enum containing all file types Ricecooker can have - Steps: - AUDIO_FILE: mp3 files - THUMBNAIL: png, jpg, or jpeg files - DOCUMENT_FILE: pdf files + Steps: + AUDIO_FILE: mp3 files + THUMBNAIL: png, jpg, or jpeg files + DOCUMENT_FILE: pdf files """ + AUDIO_FILE = 0 THUMBNAIL = 1 DOCUMENT_FILE = 2 @@ -67,51 +78,50 @@ class FileTypes(Enum): FILE_TYPE_MAPPING = { - content_kinds.AUDIO : { - file_formats.MP3 : FileTypes.AUDIO_FILE, - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.AUDIO: { + file_formats.MP3: FileTypes.AUDIO_FILE, + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, - content_kinds.DOCUMENT : { - file_formats.PDF : FileTypes.DOCUMENT_FILE, - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.DOCUMENT: { + file_formats.PDF: FileTypes.DOCUMENT_FILE, + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, - content_kinds.HTML5 : { - file_formats.HTML5 : FileTypes.HTML_ZIP_FILE, - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.HTML5: { + file_formats.HTML5: FileTypes.HTML_ZIP_FILE, + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, - content_kinds.H5P : { - file_formats.H5P : FileTypes.H5P_FILE, - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.H5P: { + file_formats.H5P: FileTypes.H5P_FILE, + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, - content_kinds.VIDEO : { - file_formats.MP4 : FileTypes.VIDEO_FILE, - file_formats.VTT : FileTypes.SUBTITLE_FILE, - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.VIDEO: { + file_formats.MP4: FileTypes.VIDEO_FILE, + file_formats.VTT: FileTypes.SUBTITLE_FILE, + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, - content_kinds.EXERCISE : { - file_formats.PNG : FileTypes.THUMBNAIL, - file_formats.JPG : FileTypes.THUMBNAIL, - file_formats.JPEG : FileTypes.THUMBNAIL, + content_kinds.EXERCISE: { + file_formats.PNG: FileTypes.THUMBNAIL, + file_formats.JPG: FileTypes.THUMBNAIL, + file_formats.JPEG: FileTypes.THUMBNAIL, }, } - def guess_file_type(kind, filepath=None, youtube_id=None, web_url=None, encoding=None): - """ guess_file_class: determines what file the content is - Args: - filepath (str): filepath of file to check - Returns: string indicating file's class + """guess_file_class: determines what file the content is + Args: + filepath (str): filepath of file to check + Returns: string indicating file's class """ if youtube_id: return FileTypes.YOUTUBE_VIDEO_FILE @@ -125,11 +135,12 @@ def guess_file_type(kind, filepath=None, youtube_id=None, web_url=None, encoding return 
FILE_TYPE_MAPPING[kind][ext] return None + def guess_content_kind(path=None, web_video_data=None, questions=None): - """ guess_content_kind: determines what kind the content is - Args: - files (str or list): files associated with content - Returns: string indicating node's kind + """guess_content_kind: determines what kind the content is + Args: + files (str or list): files associated with content + Returns: string indicating node's kind """ # If there are any questions, return exercise if questions and len(questions) > 0: @@ -140,20 +151,23 @@ def guess_content_kind(path=None, web_video_data=None, questions=None): ext = os.path.splitext(path)[1][1:].lower() if ext in content_kinds.MAPPING: return content_kinds.MAPPING[ext] - raise InvalidFormatException("Invalid file type: Allowed formats are {0}".format([key for key, value in content_kinds.MAPPING.items()])) + raise InvalidFormatException( + "Invalid file type: Allowed formats are {0}".format( + [key for key, value in content_kinds.MAPPING.items()] + ) + ) elif web_video_data: return content_kinds.VIDEO else: return content_kinds.TOPIC - # LOAD sample_tree.json (as dict) -with open(join(DATA_DIR,'sample_tree.json'),'r') as json_file: +with open(join(DATA_DIR, "sample_tree.json"), "r") as json_file: SAMPLE_TREE = json.load(json_file) # LOAD JSON DATA (as string) FOR PERSEUS QUESTIONS -SAMPLE_PERSEUS_1_JSON = open(join(DATA_DIR,'sample_perseus01.json'),'r').read() +SAMPLE_PERSEUS_1_JSON = open(join(DATA_DIR, "sample_perseus01.json"), "r").read() # SAMPLE_PERSEUS_2_JSON = open(join(DATA_DIR,'sample_perseus02.json'),'r').read() # ADD EXERCISES @@ -196,34 +210,40 @@ def guess_content_kind(path=None, web_video_data=None, questions=None): { "id": "eeeee", "question": "Which rice is your favorite? \\_\\_\\_ 
![](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAMAAAAoLQ9TAAAABGdBTUEAALGPC/xhBQAAAAFzUkdCAK7OHOkAAAAgY0hSTQAAeiYAAICEAAD6AAAAgOgAAHUwAADqYAAAOpgAABdwnLpRPAAAAmFQTFRF////wN/2I0FiNFFuAAAAxdvsN1RxV3KMnrPFFi9PAB1CVG+KXHaQI0NjttLrEjVchIF4AyNGZXB5V087UUw/EzBMpqWeb2thbmpgpqOceXVsERgfTWeADg8QCAEApKGZBAYIop+XCQkIhZ+2T2mEg5mtnK/AobPDkKO2YXqTAAAAJkBetMraZH2VprjIz9zm4enw7/T47fP3wc7ae5GnAAAAN1BsSmSApLfI1ODq2OHp5Orv8PL09vb38fb5wM/bbISbrL/PfZSpxNPgzdnj2+Pr5evw6+/z6e3w3ePp2OPsma2/ABM5Q197ABk4jKG1yNfjytfh1uDo3eXs4unv1t/nztrjqbzMTmmEXneRES1Ji6CzxtXixdPfztrk1N/n1+Dp1d/oz9vlxdPeq73NVG+KYnyUAAAddIuhwtPhvMzaxtTgytfiy9jjwtHewtHenbDCHT1fS2eCRV52qr7PvM3cucrYv87cv8/cvMzavc3bucvacoyl////ByE8WnKKscXWv9Hguszbu8zbvc7dtcnaiJqrcHZ4f4SHEh0nEitFTWZ+hJqumrDDm7HDj6W5dI2lYGJfmZeQl5SNAAAADRciAAATHjdSOVNsPlhyLklmKCYjW1lUlpOLlZKLFSAqWXSOBQAADA0NAAAAHh0bWlhSk5CIk5CIBAYJDRQbERcdDBAUBgkMAAAEDg4NAAAAHBsZWFZQkY6GAAAAAAAABQUEHBsZAAAAGxoYVlROko+GBAQDZ2RdAAAAGhkYcW9oAgICAAAAExMSDQwLjouDjYuDioiAiIV9hoN7VlRO////Z2DcYwAAAMR0Uk5TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACRKrJyrZlBQECaNXCsKaqypMGAUDcu7Gpn5mf03gDo8+4saiipKq3xRMBH83Eu7OsqbG61DkDMdbFvrizsbK3wNs9Ax/VysS/vLq/zNwfArDhxMfExMXE3pMCMe7byMjIzd33ZgYGQtnz6+zooeJXBQMFD1yHejZ1+l8FBgEELlOR+GgFCQ0SGxoBGFKg+m0BBwEMR6v+hAEDM6nRASWURVuYQQ4AAAABYktHRACIBR1IAAAACXBIWXMAAAjLAAAIywGEuOmJAAABCklEQVQY02NgUGZUUVVT19DUYtBmYmZhYdBh1dXTNzA0MjYxZTFjAwqwm1tYWlnb2NrZO3A4cgIFGJycXVzd3D08vbx9uHyBAn7+AYFBwSEhoWHhEdyRQIGo6JjYuPiExKTklFSeNKBAekZmVnZObk5efkEhbxFQgK+4pLSsvKKyqrqGoZZfgIVBsK6+obGpuaW1rV2oQ1hEgKFTtKu7p7evf8LEI5PEJotLMEyZyjJt+oyZsxhmzzk6V3KeFIO01vwFMrJyCxctXrL02DL55QwsClorVq5avWbtuvUbNh7fpMjAwsKyWWvLFJatStu279h5YhdIAAJ2s+zZu+/kfoQAy4HNLAcPHQYA5YtSi+k2/WkAAAAldEVYdGRhdGU6Y3JlYXRlADIwMTMtMTAtMDRUMTk6Mzk6MjEtMDQ6MDAwU1uYAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDEzLTEwLTA0VDE5OjM5OjIxLTA0OjAwQQ7jJAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAAAASUVORK5CYII=)", - "type":exercises.MULTIPLE_SELECTION, - "correct_answers": ["White rice", "Brown rice", "Sushi rice
abc
"], - "all_answers": ["White rice", "Quinoa","Brown rice", "<"], + "type": exercises.MULTIPLE_SELECTION, + "correct_answers": [ + "White rice", + "Brown rice", + "Sushi rice
abc
", + ], + "all_answers": ["White rice", "Quinoa", "Brown rice", "<"], }, { "id": "bbbbb", "question": "Which rice is the crunchiest?", - "type":exercises.SINGLE_SELECTION, + "type": exercises.SINGLE_SELECTION, "correct_answer": "Rice Krispies \n![](https://upload.wikimedia.org/wikipedia/commons/c/cd/RKTsquares.jpg)", "all_answers": [ "White rice", "Brown rice \n![](https://c2.staticflickr.com/4/3159/2889140143_b99fd8dd4c_z.jpg?zz=1)", - "Rice Krispies \n![](https://upload.wikimedia.org/wikipedia/commons/c/cd/RKTsquares.jpg)" + "Rice Krispies \n![](https://upload.wikimedia.org/wikipedia/commons/c/cd/RKTsquares.jpg)", ], "hints": "It's delicious", }, { - "id": "aaaaa", "question": "How many minutes does it take to cook rice? ", - "type":exercises.INPUT_QUESTION, + "type": exercises.INPUT_QUESTION, "answers": ["20", "25", "15"], - "hints": ["Takes roughly same amount of time to install kolibri on Windows machine", "Does this help?\n![](http://www.aroma-housewares.com/images/rice101/delay_timer_1.jpg)"], + "hints": [ + "Takes roughly same amount of time to install kolibri on Windows machine", + "Does this help?\n![](http://www.aroma-housewares.com/images/rice101/delay_timer_1.jpg)", + ], }, { "id": "ddddd", - "type":exercises.PERSEUS_QUESTION, - "item_data":SAMPLE_PERSEUS_1_JSON, + "type": exercises.PERSEUS_QUESTION, + "item_data": SAMPLE_PERSEUS_1_JSON, }, ], }, @@ -243,14 +263,14 @@ def guess_content_kind(path=None, web_video_data=None, questions=None): { "id": "11111", "question": "
RICE COOKING!!!
", - "type":exercises.SINGLE_SELECTION, + "type": exercises.SINGLE_SELECTION, "all_answers": ["Answer"], "correct_answer": "Answer", }, { "id": "121212", - "question": ' a 2 + b 2 = c 2 ', - "type":exercises.SINGLE_SELECTION, + "question": " a 2 + b 2 = c 2 ", + "type": exercises.SINGLE_SELECTION, "all_answers": ["Answer"], "correct_answer": "Answer", }, @@ -265,7 +285,7 @@ def guess_content_kind(path=None, web_video_data=None, questions=None): { "path": "content://htmltest.zip", } - ] + ], }, { "title": "Rice Exercise 3", @@ -283,13 +303,13 @@ def guess_content_kind(path=None, web_video_data=None, questions=None): { "id": "123456", "question": "Solve: $$(111^{x+1}\\times111^\\frac14)\div111^\\frac12=111^3$$", - "type":exercises.SINGLE_SELECTION, + "type": exercises.SINGLE_SELECTION, "all_answers": ["Yes", "No", "Rice!"], "correct_answer": "Rice!", }, ], }, - ] + ], }, ] SAMPLE_TREE.extend(EXERCISES_NODES) @@ -301,20 +321,23 @@ class SampleChef(SushiChef): We'll call its `main()` method from the command line script. """ - channel_info = { # - 'CHANNEL_SOURCE_DOMAIN': SOURCE_DOMAIN, # who is providing the content (e.g. learningequality.org) - 'CHANNEL_SOURCE_ID': SOURCE_ID, # channel's unique id - 'CHANNEL_TITLE': CHANNEL_TITLE, - 'CHANNEL_LANGUAGE': CHANNEL_LANGUAGE, - 'CHANNEL_THUMBNAIL': 'https://upload.wikimedia.org/wikipedia/commons/thumb/5/50/Banaue_Philippines_Banaue-Rice-Terraces-01.jpg/640px-Banaue_Philippines_Banaue-Rice-Terraces-01.jpg', # (optional) local path or url to image file - 'CHANNEL_DESCRIPTION': 'A sample sushi chef to demo content types.', # (optional) description of the channel (optional) + + channel_info = { # + "CHANNEL_SOURCE_DOMAIN": SOURCE_DOMAIN, # who is providing the content (e.g. learningequality.org) + "CHANNEL_SOURCE_ID": SOURCE_ID, # channel's unique id + "CHANNEL_TITLE": CHANNEL_TITLE, + "CHANNEL_LANGUAGE": CHANNEL_LANGUAGE, + "CHANNEL_THUMBNAIL": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/50/Banaue_Philippines_Banaue-Rice-Terraces-01.jpg/640px-Banaue_Philippines_Banaue-Rice-Terraces-01.jpg", # (optional) local path or url to image file + "CHANNEL_DESCRIPTION": "A sample sushi chef to demo content types.", # (optional) description of the channel (optional) } def construct_channel(self, *args, **kwargs): """ Create ChannelNode and build topic tree. 
""" - channel = self.get_channel(*args, **kwargs) # creates ChannelNode from data in self.channel_info + channel = self.get_channel( + *args, **kwargs + ) # creates ChannelNode from data in self.channel_info _build_tree(channel, SAMPLE_TREE) raise_for_invalid_channel(channel) @@ -327,8 +350,14 @@ def _build_tree(node, sourcetree): """ for child_source_node in sourcetree: try: - main_file = child_source_node['files'][0] if 'files' in child_source_node else {} - kind = guess_content_kind(path=main_file.get('path'), web_video_data=main_file.get('youtube_id') or main_file.get('web_url'), questions=child_source_node.get("questions")) + main_file = ( + child_source_node["files"][0] if "files" in child_source_node else {} + ) + kind = guess_content_kind( + path=main_file.get("path"), + web_video_data=main_file.get("youtube_id") or main_file.get("web_url"), + questions=child_source_node.get("questions"), + ) except UnknownContentKindError: continue @@ -350,11 +379,15 @@ def _build_tree(node, sourcetree): child_node = nodes.VideoNode( source_id=child_source_node["id"], title=child_source_node["title"], - license=get_license(child_source_node.get("license"), description="Description of license", copyright_holder=child_source_node.get('copyright_holder')), + license=get_license( + child_source_node.get("license"), + description="Description of license", + copyright_holder=child_source_node.get("copyright_holder"), + ), author=child_source_node.get("author"), description=child_source_node.get("description"), - derive_thumbnail=True, # video-specific data - thumbnail=child_source_node.get('thumbnail'), + derive_thumbnail=True, # video-specific data + thumbnail=child_source_node.get("thumbnail"), ) add_files(child_node, child_source_node.get("files") or []) node.add_child(child_node) @@ -386,7 +419,10 @@ def _build_tree(node, sourcetree): node.add_child(child_node) elif kind == content_kinds.EXERCISE: - mastery_model = (child_source_node.get('mastery_model') and {"mastery_model": child_source_node['mastery_model']}) or {} + mastery_model = ( + child_source_node.get("mastery_model") + and {"mastery_model": child_source_node["mastery_model"]} + ) or {} child_node = nodes.ExerciseNode( source_id=child_source_node["id"], title=child_source_node["title"], @@ -429,82 +465,123 @@ def _build_tree(node, sourcetree): add_files(child_node, child_source_node.get("files") or []) node.add_child(child_node) - else: # unknown content file format + else: # unknown content file format continue return node + def add_files(node, file_list): for f in file_list: - path = f.get('path') + path = f.get("path") if path is not None: - abspath = get_abspath(path) # NEW: expand content:// --> ./content/ in file paths + abspath = get_abspath( + path + ) # NEW: expand content:// --> ./content/ in file paths else: abspath = None - file_type = guess_file_type(node.kind, filepath=abspath, youtube_id=f.get('youtube_id'), web_url=f.get('web_url'), encoding=f.get('encoding')) + file_type = guess_file_type( + node.kind, + filepath=abspath, + youtube_id=f.get("youtube_id"), + web_url=f.get("web_url"), + encoding=f.get("encoding"), + ) if file_type == FileTypes.AUDIO_FILE: - node.add_file(files.AudioFile(path=abspath, language=f.get('language'))) + node.add_file(files.AudioFile(path=abspath, language=f.get("language"))) elif file_type == FileTypes.THUMBNAIL: node.add_file(files.ThumbnailFile(path=abspath)) elif file_type == FileTypes.DOCUMENT_FILE: - node.add_file(files.DocumentFile(path=abspath, language=f.get('language'))) + 
node.add_file(files.DocumentFile(path=abspath, language=f.get("language"))) elif file_type == FileTypes.HTML_ZIP_FILE: - node.add_file(files.HTMLZipFile(path=abspath, language=f.get('language'))) + node.add_file(files.HTMLZipFile(path=abspath, language=f.get("language"))) elif file_type == FileTypes.H5P_FILE: - node.add_file(files.H5PFile(path=abspath, language=f.get('language'))) + node.add_file(files.H5PFile(path=abspath, language=f.get("language"))) elif file_type == FileTypes.VIDEO_FILE: - node.add_file(files.VideoFile(path=abspath, language=f.get('language'), ffmpeg_settings=f.get('ffmpeg_settings'))) + node.add_file( + files.VideoFile( + path=abspath, + language=f.get("language"), + ffmpeg_settings=f.get("ffmpeg_settings"), + ) + ) elif file_type == FileTypes.SUBTITLE_FILE: - node.add_file(files.SubtitleFile(path=abspath, language=f['language'])) + node.add_file(files.SubtitleFile(path=abspath, language=f["language"])) elif file_type == FileTypes.BASE64_FILE: - node.add_file(files.Base64ImageFile(encoding=f['encoding'])) + node.add_file(files.Base64ImageFile(encoding=f["encoding"])) elif file_type == FileTypes.WEB_VIDEO_FILE: - node.add_file(files.WebVideoFile(web_url=f['web_url'], high_resolution=f.get('high_resolution'))) + node.add_file( + files.WebVideoFile( + web_url=f["web_url"], high_resolution=f.get("high_resolution") + ) + ) elif file_type == FileTypes.YOUTUBE_VIDEO_FILE: - node.add_file(files.YouTubeVideoFile(youtube_id=f['youtube_id'], high_resolution=f.get('high_resolution'))) - node.add_file(files.YouTubeSubtitleFile(youtube_id=f['youtube_id'], language='en')) + node.add_file( + files.YouTubeVideoFile( + youtube_id=f["youtube_id"], high_resolution=f.get("high_resolution") + ) + ) + node.add_file( + files.YouTubeSubtitleFile(youtube_id=f["youtube_id"], language="en") + ) else: - raise UnknownFileTypeError("Unrecognized file type '{0}'".format(f['path'])) + raise UnknownFileTypeError("Unrecognized file type '{0}'".format(f["path"])) + def create_question(raw_question): - question = parse_images(raw_question.get('question')) - hints = raw_question.get('hints') - hints = parse_images(hints) if isinstance(hints, str) else [parse_images(hint) for hint in hints or []] + question = parse_images(raw_question.get("question")) + hints = raw_question.get("hints") + hints = ( + parse_images(hints) + if isinstance(hints, str) + else [parse_images(hint) for hint in hints or []] + ) if raw_question["type"] == exercises.MULTIPLE_SELECTION: return questions.MultipleSelectQuestion( id=raw_question["id"], question=question, - correct_answers=[parse_images(answer) for answer in raw_question['correct_answers']], - all_answers=[parse_images(answer) for answer in raw_question['all_answers']], + correct_answers=[ + parse_images(answer) for answer in raw_question["correct_answers"] + ], + all_answers=[ + parse_images(answer) for answer in raw_question["all_answers"] + ], hints=hints, ) if raw_question["type"] == exercises.SINGLE_SELECTION: return questions.SingleSelectQuestion( id=raw_question["id"], question=question, - correct_answer=parse_images(raw_question['correct_answer']), - all_answers=[parse_images(answer) for answer in raw_question['all_answers']], + correct_answer=parse_images(raw_question["correct_answer"]), + all_answers=[ + parse_images(answer) for answer in raw_question["all_answers"] + ], hints=hints, ) if raw_question["type"] == exercises.INPUT_QUESTION: return questions.InputQuestion( id=raw_question["id"], question=question, - answers=[parse_images(answer) for answer in 
raw_question['answers']], + answers=[parse_images(answer) for answer in raw_question["answers"]], hints=hints, ) if raw_question["type"] == exercises.PERSEUS_QUESTION: return questions.PerseusQuestion( id=raw_question["id"], - raw_data=parse_images(raw_question.get('item_data')), + raw_data=parse_images(raw_question.get("item_data")), source_url="https://www.google.com/", ) else: - raise UnknownQuestionTypeError("Unrecognized question type '{0}': accepted types are {1}".format(raw_question["type"], [key for key, value in exercises.question_choices])) + raise UnknownQuestionTypeError( + "Unrecognized question type '{0}': accepted types are {1}".format( + raw_question["type"], [key for key, value in exercises.question_choices] + ) + ) + def parse_images(content): if content: @@ -515,12 +592,11 @@ def parse_images(content): graphie = re.search(questions.WEB_GRAPHIE_URL_REGEX, path) if graphie: path = graphie.group(1) - content = content.replace(path, get_abspath(path).replace('\\', '\\\\')) + content = content.replace(path, get_abspath(path).replace("\\", "\\\\")) return content - -if __name__ == '__main__': +if __name__ == "__main__": """ This code will run when the sushi chef is called from the command line. """ diff --git a/examples/oldexamples/wikipedia_video_chef.py b/examples/oldexamples/wikipedia_video_chef.py index 885910b5..0736b006 100755 --- a/examples/oldexamples/wikipedia_video_chef.py +++ b/examples/oldexamples/wikipedia_video_chef.py @@ -1,31 +1,39 @@ #!/usr/bin/env python -from bs4 import BeautifulSoup -import requests import tempfile +import requests +from bs4 import BeautifulSoup + from ricecooker.chefs import SushiChef from ricecooker.classes import licenses -from ricecooker.classes.files import HTMLZipFile, SubtitleFile, VideoFile -from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode, VideoNode -from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter +from ricecooker.classes.files import HTMLZipFile +from ricecooker.classes.files import SubtitleFile +from ricecooker.classes.files import VideoFile +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.classes.nodes import VideoNode +from ricecooker.utils.caching import CacheControlAdapter +from ricecooker.utils.caching import CacheForeverHeuristic +from ricecooker.utils.caching import FileCache from ricecooker.utils.html import download_file from ricecooker.utils.zip import create_predictable_zip # CHANNEL SETTINGS SOURCE_DOMAIN = "" # -SOURCE_ID = "" # an alphanumeric ID referring to this channel -CHANNEL_TITLE = "" # a human-readable title -CHANNEL_LANGUAGE = "en" # language of channel +SOURCE_ID = "" # an alphanumeric ID referring to this channel +CHANNEL_TITLE = "" # a human-readable title +CHANNEL_LANGUAGE = "en" # language of channel sess = requests.Session() -cache = FileCache('.webcache') +cache = FileCache(".webcache") basic_adapter = CacheControlAdapter(cache=cache) forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) -sess.mount('http://', forever_adapter) -sess.mount('https://', forever_adapter) +sess.mount("http://", forever_adapter) +sess.mount("https://", forever_adapter) def make_fully_qualified_url(url): @@ -71,27 +79,38 @@ def get_channel(self, *args, **kwargs): def construct_channel(self, *args, **kwargs): channel = self.get_channel(**kwargs) - videos_topic = 
TopicNode(source_id="/wiki/Category:Articles_containing_video_clips", - title="Articles containing video clips") + videos_topic = TopicNode( + source_id="/wiki/Category:Articles_containing_video_clips", + title="Articles containing video clips", + ) channel.add_child(videos_topic) - thumbnail_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg' - page = download_wikipedia_page('/wiki/Category:Articles_containing_video_clips', - thumbnail_url, 'A Is for Atom') + thumbnail_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/A_Is_for_Atom_1953.webm/220px--A_Is_for_Atom_1953.webm.jpg" + page = download_wikipedia_page( + "/wiki/Category:Articles_containing_video_clips", + thumbnail_url, + "A Is for Atom", + ) videos_topic.add_child(page) - video_url = 'https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm' + video_url = "https://upload.wikimedia.org/wikipedia/commons/e/ee/A_Is_for_Atom_1953.webm" video_file = VideoFile(path=video_url) - video_node = VideoNode(title='A Is for Atom 1953', source_id='A_Is_for_Atom_1953.webm', - files=[video_file], license=licenses.PublicDomainLicense()) + video_node = VideoNode( + title="A Is for Atom 1953", + source_id="A_Is_for_Atom_1953.webm", + files=[video_file], + license=licenses.PublicDomainLicense(), + ) - subtitle_url = 'https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt' + subtitle_url = "https://commons.wikimedia.org/w/api.php?action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang={}&trackformat=srt" subtitle_languages = [ - 'en', - 'es', + "en", + "es", ] for lang in subtitle_languages: - subtitle_file = SubtitleFile(path=subtitle_url.format(lang), language=lang, subtitlesformat='srt') + subtitle_file = SubtitleFile( + path=subtitle_url.format(lang), language=lang, subtitlesformat="srt" + ) video_node.add_file(subtitle_file) videos_topic.add_child(video_node) @@ -133,14 +152,15 @@ def process_wikipedia_page(content, baseurl, destpath, **kwargs): page = BeautifulSoup(content, "html.parser") for image in page.find_all("img"): - rel_path, _ = download_file(make_fully_qualified_url(image["src"]), destpath, - request_fn=make_request) + rel_path, _ = download_file( + make_fully_qualified_url(image["src"]), destpath, request_fn=make_request + ) image["src"] = rel_path return str(page) -if __name__ == '__main__': +if __name__ == "__main__": """ Call this script using: ./wikipedia_video_chef.py --token= diff --git a/examples/tutorial/sushichef.py b/examples/tutorial/sushichef.py index a9516283..3631dac9 100755 --- a/examples/tutorial/sushichef.py +++ b/examples/tutorial/sushichef.py @@ -1,10 +1,18 @@ #!/usr/bin/env python +from le_utils.constants import licenses from ricecooker.chefs import SushiChef -from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode, VideoNode, DocumentNode, AudioNode -from ricecooker.classes.files import DocumentFile, VideoFile, AudioFile -from le_utils.constants import licenses +from ricecooker.classes.files import AudioFile +from ricecooker.classes.files import DocumentFile +from ricecooker.classes.files import VideoFile from ricecooker.classes.licenses import get_license +from ricecooker.classes.nodes import AudioNode +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import DocumentNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import TopicNode +from 
ricecooker.classes.nodes import VideoNode + class TutorialChef(SushiChef): """ @@ -14,13 +22,13 @@ class TutorialChef(SushiChef): # 1. PROVIDE CHANNEL INFO (replace with your own values) ############################################################################ channel_info = { - 'CHANNEL_SOURCE_DOMAIN': '', # who is providing the content (e.g. learningequality.org) - 'CHANNEL_SOURCE_ID': '', # channel's unique id - 'CHANNEL_TITLE': 'The tutorial channel', - 'CHANNEL_LANGUAGE': 'en', + "CHANNEL_SOURCE_DOMAIN": "", # who is providing the content (e.g. learningequality.org) + "CHANNEL_SOURCE_ID": "", # channel's unique id + "CHANNEL_TITLE": "The tutorial channel", + "CHANNEL_LANGUAGE": "en", # 'CHANNEL_THUMBNAIL': 'http://yourdomain.org/img/logo.jpg', # (optional) local path or url to image file # 'CHANNEL_DESCRIPTION': 'What is this channel about?', # (optional) description of the channel (optional) - } + } # 2. CONSTRUCT CHANNEL ############################################################################ @@ -31,8 +39,7 @@ def construct_channel(self, *args, **kwargs): """ # Create channel ######################################################################## - channel = self.get_channel(*args, **kwargs) # uses self.channel_info - + channel = self.get_channel(*args, **kwargs) # uses self.channel_info # Create topics to add to your channel ######################################################################## @@ -53,30 +60,56 @@ def construct_channel(self, *args, **kwargs): exampletopic.add_child(examplesubtopic) # TODO: Add your subtopic to your topic here - # Content # You can add documents (pdfs and ePubs), videos, audios, and other content ######################################################################## # let's create a document file called 'Example PDF' document_file = DocumentFile(path="http://www.pdf995.com/samples/pdf.pdf") - examplepdf = DocumentNode(title="Example PDF", source_id="example-pdf", files=[document_file], license=get_license(licenses.PUBLIC_DOMAIN)) + examplepdf = DocumentNode( + title="Example PDF", + source_id="example-pdf", + files=[document_file], + license=get_license(licenses.PUBLIC_DOMAIN), + ) # TODO: Create your pdf file here (use any url to a .pdf file) # We are also going to add a video file called 'Example Video' - video_file = VideoFile(path="https://ia600209.us.archive.org/27/items/RiceChef/Rice Chef.mp4") - fancy_license = get_license(licenses.SPECIAL_PERMISSIONS, description='Special license for ricecooker fans only.', copyright_holder='The chef video makers') - examplevideo = VideoNode(title="Example Video", source_id="example-video", files=[video_file], license=fancy_license) + video_file = VideoFile( + path="https://ia600209.us.archive.org/27/items/RiceChef/Rice Chef.mp4" + ) + fancy_license = get_license( + licenses.SPECIAL_PERMISSIONS, + description="Special license for ricecooker fans only.", + copyright_holder="The chef video makers", + ) + examplevideo = VideoNode( + title="Example Video", + source_id="example-video", + files=[video_file], + license=fancy_license, + ) # TODO: Create your video file here (use any url to a .mp4 file) # Finally, we are creating an audio file called 'Example Audio' - audio_file = AudioFile(path="https://ia802508.us.archive.org/5/items/testmp3testfile/mpthreetest.mp3") - exampleaudio = AudioNode(title="Example Audio", source_id="example-audio", files=[audio_file], license=get_license(licenses.PUBLIC_DOMAIN)) + audio_file = AudioFile( + 
path="https://ia802508.us.archive.org/5/items/testmp3testfile/mpthreetest.mp3" + ) + exampleaudio = AudioNode( + title="Example Audio", + source_id="example-audio", + files=[audio_file], + license=get_license(licenses.PUBLIC_DOMAIN), + ) # TODO: Create your audio file here (use any url to a .mp3 file) # Now that we have our files, let's add them to our channel - channel.add_child(examplepdf) # Adding 'Example PDF' to your channel - exampletopic.add_child(examplevideo) # Adding 'Example Video' to 'Example Topic' - examplesubtopic.add_child(exampleaudio) # Adding 'Example Audio' to 'Example Subtopic' + channel.add_child(examplepdf) # Adding 'Example PDF' to your channel + exampletopic.add_child( + examplevideo + ) # Adding 'Example Video' to 'Example Topic' + examplesubtopic.add_child( + exampleaudio + ) # Adding 'Example Audio' to 'Example Subtopic' # TODO: Add your pdf file to your channel # TODO: Add your video file to your topic @@ -87,7 +120,7 @@ def construct_channel(self, *args, **kwargs): return channel -if __name__ == '__main__': +if __name__ == "__main__": """ This code will run when the sushi chef is called from the command line. """ diff --git a/examples/wikipedia/README.md b/examples/wikipedia/README.md index c097fef7..361cc137 100644 --- a/examples/wikipedia/README.md +++ b/examples/wikipedia/README.md @@ -8,4 +8,3 @@ packages their contents as standalone `HTMLZipFile`s and uploads them to Studio. ## Running the script ./sushichef.py --token=YOURSTUDIOTOKENHERE9139139f3a23232 - diff --git a/examples/wikipedia/sushichef.py b/examples/wikipedia/sushichef.py index 3516978d..e2eed934 100755 --- a/examples/wikipedia/sushichef.py +++ b/examples/wikipedia/sushichef.py @@ -1,32 +1,38 @@ #!/usr/bin/env python -from bs4 import BeautifulSoup -import requests import tempfile +import requests +from bs4 import BeautifulSoup + from ricecooker.chefs import SushiChef from ricecooker.classes import licenses from ricecooker.classes.files import HTMLZipFile -from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode -from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.config import LOGGER +from ricecooker.utils.caching import CacheControlAdapter +from ricecooker.utils.caching import CacheForeverHeuristic +from ricecooker.utils.caching import FileCache from ricecooker.utils.html import download_file from ricecooker.utils.zip import create_predictable_zip -from ricecooker.config import LOGGER # CHANNEL SETTINGS -SOURCE_DOMAIN = "" # -SOURCE_ID = "" # an alphanumeric ID refering to this channel -CHANNEL_TITLE = "" # a humand-readbale title -CHANNEL_LANGUAGE = "en" # language of channel +SOURCE_DOMAIN = "" # +SOURCE_ID = "" # an alphanumeric ID refering to this channel +CHANNEL_TITLE = "" # a humand-readbale title +CHANNEL_LANGUAGE = "en" # language of channel sess = requests.Session() -cache = FileCache('.webcache') +cache = FileCache(".webcache") basic_adapter = CacheControlAdapter(cache=cache) -forever_adapter= CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) +forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) + +sess.mount("http://", forever_adapter) +sess.mount("https://", forever_adapter) -sess.mount('http://', forever_adapter) -sess.mount('https://', forever_adapter) def make_fully_qualified_url(url): if url.startswith("//"): 
@@ -38,6 +44,7 @@ def make_fully_qualified_url(url): return None return url + def make_request(url, *args, **kwargs): response = sess.get(url, *args, **kwargs) if response.status_code != 200: @@ -46,14 +53,13 @@ def make_request(url, *args, **kwargs): LOGGER.warning("NOT CACHED: " + url) return response + def get_parsed_html_from_url(url, *args, **kwargs): html = make_request(url, *args, **kwargs).content return BeautifulSoup(html, "html.parser") - class WikipediaChef(SushiChef): - def get_channel(self, *args, **kwargs): channel = ChannelNode( @@ -71,16 +77,21 @@ def construct_channel(self, *args, **kwargs): channel = self.get_channel(**kwargs) citrus_topic = TopicNode(source_id="List_of_citrus_fruits", title="Citrus!") channel.add_child(citrus_topic) - add_subpages_from_wikipedia_list(citrus_topic, "https://en.wikipedia.org/wiki/List_of_citrus_fruits") + add_subpages_from_wikipedia_list( + citrus_topic, "https://en.wikipedia.org/wiki/List_of_citrus_fruits" + ) - potato_topic = TopicNode(source_id="List_of_potato_cultivars", title="Potatoes!") + potato_topic = TopicNode( + source_id="List_of_potato_cultivars", title="Potatoes!" + ) channel.add_child(potato_topic) - add_subpages_from_wikipedia_list(potato_topic, "https://en.wikipedia.org/wiki/List_of_potato_cultivars") + add_subpages_from_wikipedia_list( + potato_topic, "https://en.wikipedia.org/wiki/List_of_potato_cultivars" + ) return channel - def add_subpages_from_wikipedia_list(topic, list_url): # to understand how the following parsing works, look at: @@ -119,7 +130,9 @@ def add_subpages_from_wikipedia_list(topic, list_url): # attempt to extract a thumbnail for the subpage, from the second column in the table image = columns[1].find("img") thumbnail_url = make_fully_qualified_url(image["src"]) if image else None - if thumbnail_url and not (thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png")): + if thumbnail_url and not ( + thumbnail_url.endswith("jpg") or thumbnail_url.endswith("png") + ): thumbnail_url = None # download the wikipedia page into an HTML5 app node @@ -163,14 +176,15 @@ def process_wikipedia_page(content, baseurl, destpath, **kwargs): page = BeautifulSoup(content, "html.parser") for image in page.find_all("img"): - relpath, _ = download_file(make_fully_qualified_url(image["src"]), destpath, request_fn=make_request) + relpath, _ = download_file( + make_fully_qualified_url(image["src"]), destpath, request_fn=make_request + ) image["src"] = relpath return str(page) - -if __name__ == '__main__': +if __name__ == "__main__": """ Call this script using: ./sushichef.py --token=YOURSTUDIOTOKENHERE9139139f3a23232 diff --git a/resources/scripts/convertvideo.bat b/resources/scripts/convertvideo.bat index cc255430..2712ebf2 100644 --- a/resources/scripts/convertvideo.bat +++ b/resources/scripts/convertvideo.bat @@ -3,7 +3,7 @@ TITLE Video conversion and compression script REM Video conversion and compression script Learning Equality 2018 REM Usage: REM convertvideo.bat inputfile.mpg [outputfile.mp4] -REM +REM REM This script will perform the following conversion steps: REM - Apply CRF 32 compression (very aggressive; may need to adjust below) REM - Limit the audio track to 32k/sec @@ -23,7 +23,7 @@ IF %ERRORLEVEL% NEQ 0 ( ) -REM 2. Parse input filename +REM 2. 
Parse input filename
REM ############################################################################
IF NOT "%~1" == "" (
    set "INFILE=%~1"
diff --git a/resources/scripts/convertvideo.sh b/resources/scripts/convertvideo.sh
index db2fa58b..d1f128f9 100755
--- a/resources/scripts/convertvideo.sh
+++ b/resources/scripts/convertvideo.sh
@@ -19,7 +19,7 @@ then
     exit 1
 fi
 
-# 2. Parse input filename 
+# 2. Parse input filename
 ################################################################################
 if [ ! -z "$1" ]
 then
diff --git a/resources/templates/csv_channel/Channel.csv b/resources/templates/csv_channel/Channel.csv
index 89f8be9d..c64b22f9 100644
--- a/resources/templates/csv_channel/Channel.csv
+++ b/resources/templates/csv_channel/Channel.csv
@@ -1 +1 @@
-Title,Description,Domain,Source ID,Language,Thumbnail
\ No newline at end of file
+Title,Description,Domain,Source ID,Language,Thumbnail
diff --git a/resources/templates/csv_channel/Content.csv b/resources/templates/csv_channel/Content.csv
index 70835bd2..514d5865 100644
--- a/resources/templates/csv_channel/Content.csv
+++ b/resources/templates/csv_channel/Content.csv
@@ -1 +1 @@
-Path *,Title *,Source ID,Description,Author,Language,License ID *,License Description,Copyright Holder,Thumbnail
\ No newline at end of file
+Path *,Title *,Source ID,Description,Author,Language,License ID *,License Description,Copyright Holder,Thumbnail
diff --git a/resources/templates/csv_channel/ExerciseQuestions.csv b/resources/templates/csv_channel/ExerciseQuestions.csv
index d58f0e0a..c31b0ae9 100644
--- a/resources/templates/csv_channel/ExerciseQuestions.csv
+++ b/resources/templates/csv_channel/ExerciseQuestions.csv
@@ -1 +1 @@
-Source ID *,Question ID *,Question type *,Question *,Option A,Option B,Option C,Option D,Option E,Options F...,Correct Answer *,Correct Answer 2,Correct Answer 3,Hint 1,Hint 2,Hint 3,Hint 4,Hint 5,Hint 6+
\ No newline at end of file
+Source ID *,Question ID *,Question type *,Question *,Option A,Option B,Option C,Option D,Option E,Options F...,Correct Answer *,Correct Answer 2,Correct Answer 3,Hint 1,Hint 2,Hint 3,Hint 4,Hint 5,Hint 6+
diff --git a/resources/templates/csv_channel/Exercises.csv b/resources/templates/csv_channel/Exercises.csv
index d6c8cd34..cdb7ea85 100644
--- a/resources/templates/csv_channel/Exercises.csv
+++ b/resources/templates/csv_channel/Exercises.csv
@@ -1 +1 @@
-Path *,Title *,Source ID *,Description,Author,Language,License ID *,License Description,Copyright Holder,Number Correct,Out of Total,Randomize,Thumbnail
\ No newline at end of file
+Path *,Title *,Source ID *,Description,Author,Language,License ID *,License Description,Copyright Holder,Number Correct,Out of Total,Randomize,Thumbnail
diff --git a/resources/templates/csv_channel/csvchef.py b/resources/templates/csv_channel/csvchef.py
index 8d0ed58d..a0c6b6e4 100755
--- a/resources/templates/csv_channel/csvchef.py
+++ b/resources/templates/csv_channel/csvchef.py
@@ -1,15 +1,18 @@
 #!/usr/bin/env python
 from ricecooker.chefs import LineCook
 
+
 class CsvChef(LineCook):
     """
     Sushi chef for creating Kolibri Studio channels from local files and
     metdata provided in Channel.csv and Content.csv.
     """
+
     pass  # no custom methods needed: the `LineCook` base class will do the cheffing.
 
 # Run `python csvchef.py -h` to see all the supported command line options
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     chef = CsvChef()
     chef.main()
diff --git a/ricecooker/__init__.py b/ricecooker/__init__.py
index 7274a0f1..23243245 100644
--- a/ricecooker/__init__.py
+++ b/ricecooker/__init__.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
-__author__ = 'Learning Equality'
-__email__ = 'info@learningequality.org'
-__version__ = '0.7.0b2'
+__author__ = "Learning Equality"
+__email__ = "info@learningequality.org"
+__version__ = "0.7.0b2"
 
 import sys
diff --git a/ricecooker/chefs.py b/ricecooker/chefs.py
index 92d36976..d60d0387 100644
--- a/ricecooker/chefs.py
+++ b/ricecooker/chefs.py
@@ -1,13 +1,14 @@
 import argparse
+import csv
 import json
 import logging
 import os
-import requests
-import sys
-import csv
 import re
+import sys
 from datetime import datetime
 
+import requests
+
 from . import config
 from .classes import files
 from .classes import nodes
@@ -15,7 +16,6 @@
 from .exceptions import InvalidUsageException
 from .exceptions import raise_for_invalid_channel
 from .managers.progress import Status
-
 from .utils.downloader import get_archive_filename
 from .utils.jsontrees import build_tree_from_json
 from .utils.jsontrees import get_channel_node_from_json
@@ -28,22 +28,24 @@
 from .utils.metadata_provider import DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME
 from .utils.metadata_provider import DEFAULT_EXERCISES_INFO_FILENAME
 from .utils.tokens import get_content_curation_token
-from .utils.youtube import YouTubeVideoUtils, YouTubePlaylistUtils
-
+from .utils.youtube import YouTubePlaylistUtils
+from .utils.youtube import YouTubeVideoUtils
 from ricecooker.utils.images import convert_image
 
 
 # SUSHI CHEF BASE CLASS
 ################################################################################
 
+
 class SushiChef(object):
     """
     This is the base class that all content integration scripts should subclass.
     Sushi chef scripts call the `main` method as the entry point, which in turn
     calls the `run` method to do the work (see `uploadchannel` in `commands.py`).
     """
+
     CHEF_RUN_DATA = config.CHEF_DATA_DEFAULT  # loaded from chefdata/chef_data.json
-    TREES_DATA_DIR = config.TREES_DATA_DIR   # tree archives and JsonTreeChef inputs
+    TREES_DATA_DIR = config.TREES_DATA_DIR  # tree archives and JsonTreeChef inputs
 
     channel_node_class = nodes.ChannelNode
 
@@ -55,7 +57,7 @@ def __init__(self, *args, **kwargs):
 
         # persistent settings for the chef, we check if it exists first in order to
         # support assignment as a class-level variable.
-        if not hasattr(self, 'SETTINGS'):
+        if not hasattr(self, "SETTINGS"):
             self.SETTINGS = {}
 
         # these will be assigned to later by the argparse handling.
@@ -64,9 +66,9 @@ def __init__(self, *args, **kwargs):
 
         # ARGPARSE SETUP
         # We don't want to add argparse help if subclass has an __init__ method
-        subclasses = self.__class__.__mro__[:-2]  # all subclasses after this
-        if any(['__init__' in c.__dict__.keys() for c in subclasses]):
-            add_parser_help = False   # assume subclass' __init__ will add help
+        subclasses = self.__class__.__mro__[:-2]  # all subclasses after this
+        if any(["__init__" in c.__dict__.keys() for c in subclasses]):
+            add_parser_help = False  # assume subclass' __init__ will add help
         else:
             add_parser_help = True
         parser = argparse.ArgumentParser(
@@ -75,28 +77,99 @@ def __init__(self, *args, **kwargs):
         )
         self.arg_parser = parser  # save as class attr.
for subclasses to extend # ARGS - parser.add_argument('command', nargs='?', default='uploadchannel', help='Desired action: dryrun or uploadchannel (default).') - parser.add_argument('--token', default='#', help='Studio API Access Token (specify wither the token value or the path of a file that contains the token).') - parser.add_argument('-u', '--update', action='store_true', help='Force file re-download (skip .ricecookerfilecache/).') - parser.add_argument('--debug', action='store_true', help='Print extra debugging infomation.') - parser.add_argument('-v', '--verbose', action='store_true', default=True, help='Verbose mode (default).') - parser.add_argument('--warn', action='store_true', help='Print errors and warnings.') - parser.add_argument('--quiet', action='store_true', help='Print only errors.') - parser.add_argument('--compress', action='store_true', help='Compress videos using ffmpeg -crf=32 -b:a 32k mono.') - parser.add_argument('--thumbnails', action='store_true', help='Automatically generate thumbnails for content nodes.') - parser.add_argument('--download-attempts',type=int,default=3, help='Maximum number of times to retry downloading files.') - parser.add_argument('--resume', action='store_true', help='Resume chef session from a specified step.') + parser.add_argument( + "command", + nargs="?", + default="uploadchannel", + help="Desired action: dryrun or uploadchannel (default).", + ) + parser.add_argument( + "--token", + default="#", + help="Studio API Access Token (specify wither the token value or the path of a file that contains the token).", + ) + parser.add_argument( + "-u", + "--update", + action="store_true", + help="Force file re-download (skip .ricecookerfilecache/).", + ) + parser.add_argument( + "--debug", action="store_true", help="Print extra debugging infomation." + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=True, + help="Verbose mode (default).", + ) + parser.add_argument( + "--warn", action="store_true", help="Print errors and warnings." + ) + parser.add_argument("--quiet", action="store_true", help="Print only errors.") + parser.add_argument( + "--compress", + action="store_true", + help="Compress videos using ffmpeg -crf=32 -b:a 32k mono.", + ) + parser.add_argument( + "--thumbnails", + action="store_true", + help="Automatically generate thumbnails for content nodes.", + ) + parser.add_argument( + "--download-attempts", + type=int, + default=3, + help="Maximum number of times to retry downloading files.", + ) + parser.add_argument( + "--resume", + action="store_true", + help="Resume chef session from a specified step.", + ) allsteps = [step.name.upper() for step in Status] - parser.add_argument('--step',choices=allsteps,default='LAST', help='Step to resume progress from (use with the --resume).') - parser.add_argument('--prompt', action='store_true', help='Prompt user to open the channel after the chef run.') - parser.add_argument('--deploy', dest='stage', action='store_false', - help='Immediately deploy changes to channel\'s main tree. This operation will overwrite the previous channel content. 
Use only during development.') - parser.add_argument('--publish', action='store_true', help='Publish newly uploaded version of the channel.') - parser.add_argument('--sample', type=int, metavar='SIZE', help='Upload a sample of SIZE nodes from the channel.') - parser.add_argument('--reset', dest="reset_deprecated", action='store_true', - help='(deprecated) Restarting the chef run is the default.') - parser.add_argument('--stage', dest='stage_deprecated', action='store_true', - help='(deprecated) Stage updated content for review. Uploading a staging tree is now the default behavior. Use --deploy to upload to the main tree.') + parser.add_argument( + "--step", + choices=allsteps, + default="LAST", + help="Step to resume progress from (use with the --resume).", + ) + parser.add_argument( + "--prompt", + action="store_true", + help="Prompt user to open the channel after the chef run.", + ) + parser.add_argument( + "--deploy", + dest="stage", + action="store_false", + help="Immediately deploy changes to channel's main tree. This operation will overwrite the previous channel content. Use only during development.", + ) + parser.add_argument( + "--publish", + action="store_true", + help="Publish newly uploaded version of the channel.", + ) + parser.add_argument( + "--sample", + type=int, + metavar="SIZE", + help="Upload a sample of SIZE nodes from the channel.", + ) + parser.add_argument( + "--reset", + dest="reset_deprecated", + action="store_true", + help="(deprecated) Restarting the chef run is the default.", + ) + parser.add_argument( + "--stage", + dest="stage_deprecated", + action="store_true", + help="(deprecated) Stage updated content for review. Uploading a staging tree is now the default behavior. Use --deploy to upload to the main tree.", + ) # [OPTIONS] --- extra key=value options are supported, but do not appear in help @@ -118,11 +191,11 @@ def get_setting(self, setting, default=None): override = None # If there is a command line flag for this setting, allow for it to override the chef # default. Note that these are all boolean flags, so they are true if set, false if not. - if setting == 'generate-missing-thumbnails': - override = self.args and self.args['thumbnails'] + if setting == "generate-missing-thumbnails": + override = self.args and self.args["thumbnails"] - if setting == 'compress-videos': - override = self.args and self.args['compress'] + if setting == "compress-videos": + override = self.args and self.args["compress"] if setting in self.SETTINGS: return override or self.SETTINGS[setting] @@ -146,28 +219,35 @@ def parse_args_and_options(self): # Handle case when command is not specified but key=value options are allcommands = [ - 'uploadchannel', # Whole pipeline: pre_run > run > [deploy,publish] - 'dryrun', # Do pre_run and run but do not upload to Studio + "uploadchannel", # Whole pipeline: pre_run > run > [deploy,publish] + "dryrun", # Do pre_run and run but do not upload to Studio ] - command_arg = args['command'] - if command_arg not in allcommands and '=' in command_arg: + command_arg = args["command"] + if command_arg not in allcommands and "=" in command_arg: # a key=value options pair was incorrectly recognized as the command - args['command'] = 'uploadchannel' + args["command"] = "uploadchannel" options_list.append(command_arg) # put command_arg where it belongs # Print CLI deprecation warnings info - if args['stage_deprecated']: - config.LOGGER.warning('DEPRECATION WARNING: --stage is now the default bevavior. 
The --stage flag has been deprecated and will be removed in ricecooker 1.0.') - if args['reset_deprecated']: + if args["stage_deprecated"]: + config.LOGGER.warning( + "DEPRECATION WARNING: --stage is now the default bevavior. The --stage flag has been deprecated and will be removed in ricecooker 1.0." + ) + if args["reset_deprecated"]: config.LOGGER.warning( - 'DEPRECATION WARNING: --reset is now the default bevavior. The --reset flag has been deprecated and will be removed in ricecooker 1.0.') - if args['publish'] and args['stage']: - raise InvalidUsageException('The --publish argument must be used together with --deploy argument.') - logging_args = [key for key in ['quiet', 'warn', 'debug'] if args[key]] + "DEPRECATION WARNING: --reset is now the default bevavior. The --reset flag has been deprecated and will be removed in ricecooker 1.0." + ) + if args["publish"] and args["stage"]: + raise InvalidUsageException( + "The --publish argument must be used together with --deploy argument." + ) + logging_args = [key for key in ["quiet", "warn", "debug"] if args[key]] if len(logging_args) > 1: - raise InvalidUsageException('Agruments --quiet, --warn, and --debug cannot be used together.') + raise InvalidUsageException( + "Agruments --quiet, --warn, and --debug cannot be used together." + ) - if args['command'] == 'uploadchannel': + if args["command"] == "uploadchannel": # Make sure token is provided. There are four ways to specify: # 1. --token=path to token-containing file # 2. --token=140fefe...1f3 @@ -175,16 +255,18 @@ def parse_args_and_options(self): # 3. we look for environment variable STUDIO_TOKEN # 4. else prompt user # If ALL of these fail, this call will raise and chef run will stop. - args['token'] = get_content_curation_token(args['token']) + args["token"] = get_content_curation_token(args["token"]) # Parse additional keyword arguments from `options_list` options = {} for preoption in options_list: try: - option_key, option_value = preoption.split('=') + option_key, option_value = preoption.split("=") options.update({option_key.strip(): option_value.strip()}) except IndexError: - msg = "Invalid option '{0}': use [key]=[value] format (no whitespace)".format(preoption) + msg = "Invalid option '{0}': use [key]=[value] format (no whitespace)".format( + preoption + ) raise InvalidUsageException(msg) self.args = args @@ -192,7 +274,6 @@ def parse_args_and_options(self): return args, options - def config_logger(self, args, options): """ Set up stream (stderr), local file logging (logs/yyyy-mm-dd__HHMM.log). @@ -201,11 +282,11 @@ def config_logger(self, args, options): """ # Set desired logging level based on command line arguments level = logging.INFO - if args['debug']: + if args["debug"]: level = logging.DEBUG - elif args['warn']: + elif args["warn"]: level = logging.WARNING - elif args['quiet']: + elif args["quiet"]: level = logging.ERROR # 2. 
File handler (logs/yyyy-mm-dd__HHMM.log) @@ -222,8 +303,7 @@ def config_logger(self, args, options): config.setup_logging(level=level, main_log=main_log, error_log=error_log) except Exception as e: - config.LOGGER.warning('Unable to setup file logging due to %s' % e) - + config.LOGGER.warning("Unable to setup file logging due to %s" % e) def get_channel(self, **kwargs): """ @@ -234,37 +314,46 @@ def get_channel(self, **kwargs): kwargs (dict): additional keyword arguments given to `uploadchannel` Returns: an empty `ChannelNode` that contains all the channel metadata """ - if hasattr(self, 'channel_info'): + if hasattr(self, "channel_info"): # Make sure we're not using the template id values in `channel_info` - template_domains = [''] - using_template_domain = self.channel_info['CHANNEL_SOURCE_DOMAIN'] in template_domains + template_domains = [""] + using_template_domain = ( + self.channel_info["CHANNEL_SOURCE_DOMAIN"] in template_domains + ) if using_template_domain: - config.LOGGER.error("Template source domain detected. Please change CHANNEL_SOURCE_DOMAIN before running this chef.") + config.LOGGER.error( + "Template source domain detected. Please change CHANNEL_SOURCE_DOMAIN before running this chef." + ) - template_ids = ['', ''] - using_template_source_id = self.channel_info['CHANNEL_SOURCE_ID'] in template_ids + template_ids = ["", ""] + using_template_source_id = ( + self.channel_info["CHANNEL_SOURCE_ID"] in template_ids + ) if using_template_source_id: - config.LOGGER.error("Template channel source ID detected. Please change CHANNEL_SOURCE_ID before running this chef.") + config.LOGGER.error( + "Template channel source ID detected. Please change CHANNEL_SOURCE_ID before running this chef." + ) if using_template_domain or using_template_source_id: - sys.exit(1) + sys.exit(1) # If a sublass has an `channel_info` attribute (dict) it doesn't need # to define a `get_channel` method and instead rely on this code: channel = self.channel_node_class( - source_domain=self.channel_info['CHANNEL_SOURCE_DOMAIN'], - source_id=self.channel_info['CHANNEL_SOURCE_ID'], - title=self.channel_info['CHANNEL_TITLE'], - tagline=self.channel_info.get('CHANNEL_TAGLINE'), - channel_id=self.channel_info.get('CHANNEL_ID'), - thumbnail=self.channel_info.get('CHANNEL_THUMBNAIL'), - language=self.channel_info.get('CHANNEL_LANGUAGE'), - description=self.channel_info.get('CHANNEL_DESCRIPTION'), + source_domain=self.channel_info["CHANNEL_SOURCE_DOMAIN"], + source_id=self.channel_info["CHANNEL_SOURCE_ID"], + title=self.channel_info["CHANNEL_TITLE"], + tagline=self.channel_info.get("CHANNEL_TAGLINE"), + channel_id=self.channel_info.get("CHANNEL_ID"), + thumbnail=self.channel_info.get("CHANNEL_THUMBNAIL"), + language=self.channel_info.get("CHANNEL_LANGUAGE"), + description=self.channel_info.get("CHANNEL_DESCRIPTION"), ) return channel else: - raise NotImplementedError('Subclass must define get_channel method or have a channel_info (dict) attribute.') - + raise NotImplementedError( + "Subclass must define get_channel method or have a channel_info (dict) attribute." 
+ ) def construct_channel(self, **kwargs): """ @@ -273,27 +362,38 @@ def construct_channel(self, **kwargs): kwargs (dict): additional keyword arguments given to `uploadchannel` Returns: a `ChannelNode` object representing the populated topic tree """ - raise NotImplementedError('Chef subclass must implement this method') - + raise NotImplementedError("Chef subclass must implement this method") def load_chef_data(self): if os.path.exists(config.DATA_PATH): self.CHEF_RUN_DATA = json.load(open(config.DATA_PATH)) - def save_channel_tree_as_json(self, channel): - filename = os.path.join(self.TREES_DATA_DIR, '{}.json'.format(self.CHEF_RUN_DATA['current_run'])) + filename = os.path.join( + self.TREES_DATA_DIR, "{}.json".format(self.CHEF_RUN_DATA["current_run"]) + ) os.makedirs(self.TREES_DATA_DIR, exist_ok=True) - json.dump(channel.get_json_tree(), open(filename, 'w'), indent=2) - self.CHEF_RUN_DATA['tree_archives']['previous'] = self.CHEF_RUN_DATA['tree_archives']['current'] - self.CHEF_RUN_DATA['tree_archives']['current'] = filename.replace(os.getcwd() + '/', '') + json.dump(channel.get_json_tree(), open(filename, "w"), indent=2) + self.CHEF_RUN_DATA["tree_archives"]["previous"] = self.CHEF_RUN_DATA[ + "tree_archives" + ]["current"] + self.CHEF_RUN_DATA["tree_archives"]["current"] = filename.replace( + os.getcwd() + "/", "" + ) self.save_chef_data() def save_channel_metadata_as_csv(self, channel): # create data folder in chefdata - DATA_DIR = os.path.join('chefdata', 'data') - os.makedirs(DATA_DIR, exist_ok = True) - metadata_csv = csv.writer(open(os.path.join(DATA_DIR, 'content_metadata.csv'), 'w', newline='', encoding='utf-8')) + DATA_DIR = os.path.join("chefdata", "data") + os.makedirs(DATA_DIR, exist_ok=True) + metadata_csv = csv.writer( + open( + os.path.join(DATA_DIR, "content_metadata.csv"), + "w", + newline="", + encoding="utf-8", + ) + ) metadata_csv.writerow(config.CSV_HEADERS) channel.save_channel_children_to_csv(metadata_csv) @@ -301,34 +401,40 @@ def save_channel_metadata_as_csv(self, channel): def load_channel_metadata_from_csv(self): metadata_dict = dict() metadata_csv = None - CSV_FILE_PATH = os.path.join('chefdata', 'data', 'content_metadata.csv') + CSV_FILE_PATH = os.path.join("chefdata", "data", "content_metadata.csv") if os.path.exists(CSV_FILE_PATH): - metadata_csv = csv.DictReader(open(CSV_FILE_PATH, 'r', encoding='utf-8')) + metadata_csv = csv.DictReader(open(CSV_FILE_PATH, "r", encoding="utf-8")) for line in metadata_csv: # Add to metadata_dict any updated data. 
Skip if none - line_source_id = line['Source ID'] - line_new_title = line['New Title'] - line_new_description = line['New Description'] - line_new_tags = line['New Tags'] - if line_new_title != '' or line_new_description != '' or line_new_tags != '': + line_source_id = line["Source ID"] + line_new_title = line["New Title"] + line_new_description = line["New Description"] + line_new_tags = line["New Tags"] + if ( + line_new_title != "" + or line_new_description != "" + or line_new_tags != "" + ): metadata_dict[line_source_id] = {} - if line_new_title != '': - metadata_dict[line_source_id]['New Title'] = line_new_title - if line_new_description != '': - metadata_dict[line_source_id]['New Description'] = line_new_description - if line_new_tags != '': - tags_arr = re.split(',| ,', line_new_tags) - metadata_dict[line_source_id]['New Tags'] = tags_arr + if line_new_title != "": + metadata_dict[line_source_id]["New Title"] = line_new_title + if line_new_description != "": + metadata_dict[line_source_id][ + "New Description" + ] = line_new_description + if line_new_tags != "": + tags_arr = re.split(",| ,", line_new_tags) + metadata_dict[line_source_id]["New Tags"] = tags_arr return metadata_dict def save_chef_data(self): - json.dump(self.CHEF_RUN_DATA, open(config.DATA_PATH, 'w'), indent=2) + json.dump(self.CHEF_RUN_DATA, open(config.DATA_PATH, "w"), indent=2) - def apply_modifications(self, contentNode, metadata_dict = {}): + def apply_modifications(self, contentNode, metadata_dict={}): # Skip if no metadata file passed in or no updates in metadata_dict if metadata_dict == {}: return - + is_channel = isinstance(contentNode, ChannelNode) if not is_channel: @@ -338,7 +444,6 @@ def apply_modifications(self, contentNode, metadata_dict = {}): for child in contentNode.children: self.apply_modifications(child, metadata_dict) - def pre_run(self, args, options): """ This function is called before the Chef's `run` mehod is called. @@ -350,7 +455,6 @@ def pre_run(self, args, options): """ pass - def run(self, args, options): """ This function calls uploadchannel which performs all the run steps: @@ -359,18 +463,22 @@ def run(self, args, options): options (dict): additional key=value options given on command line """ args_copy = args.copy() - args_copy['token'] = args_copy['token'][0:6] + '...' - config.LOGGER.info('In SushiChef.run method. args=' + str(args_copy) + ' options=' + str(options)) + args_copy["token"] = args_copy["token"][0:6] + "..." + config.LOGGER.info( + "In SushiChef.run method. args=" + + str(args_copy) + + " options=" + + str(options) + ) run_id = datetime.now().strftime("%Y-%m-%d__%H%M") - self.CHEF_RUN_DATA['current_run'] = run_id - self.CHEF_RUN_DATA['runs'].append({'id': run_id}) + self.CHEF_RUN_DATA["current_run"] = run_id + self.CHEF_RUN_DATA["runs"].append({"id": run_id}) # TODO(Kevin): move self.download_content() call here self.pre_run(args, options) uploadchannel_wrapper(self, args, options) - def main(self): """ Main entry point that content integration scripts should call. @@ -380,11 +488,10 @@ def main(self): self.run(args, options) - - # JSON TREE CHEF ################################################################################ + class JsonTreeChef(SushiChef): """ This sushi chef loads the data from a channel from a ricecooker json tree file @@ -425,13 +532,16 @@ class JsonTreeChef(SushiChef): Each object in the json tree correponds to a TopicNode, a ContentNode that contains a Files or an Exercise that contains Question. 
""" - RICECOOKER_JSON_TREE = 'ricecooker_json_tree.json' + + RICECOOKER_JSON_TREE = "ricecooker_json_tree.json" def pre_run(self, args, options): """ This function is called before `run` to create the json tree file. """ - raise NotImplementedError('JsonTreeChef subclass must implement the `pre_run` method.') + raise NotImplementedError( + "JsonTreeChef subclass must implement the `pre_run` method." + ) def get_json_tree_path(self, *args, **kwargs): """ @@ -455,106 +565,129 @@ def construct_channel(self, **kwargs): channel = self.get_channel(**kwargs) json_tree_path = self.get_json_tree_path(**kwargs) json_tree = read_tree_from_json(json_tree_path) - build_tree_from_json(channel, json_tree['children']) + build_tree_from_json(channel, json_tree["children"]) raise_for_invalid_channel(channel) return channel - - # SOUSCHEF LINECOOK ################################################################################ + class LineCook(JsonTreeChef): """ This sushi chef uses os.walk to import the content in `channeldir` folder `directory structure + CSV metadata files --> Kolibri channel`. Folders and CSV files can be creaed by hand or by a `souschef` script. """ + metadata_provider = None def __init__(self, *args, **kwargs): super(LineCook, self).__init__(*args, **kwargs) # We don't want to add argparse help if subclass has an __init__ method - subclasses = self.__class__.__mro__[:-5] # all subclasses after this - if any(['__init__' in c.__dict__.keys() for c in subclasses]): - add_parser_help = False # assume subclass' __init__ will add help + subclasses = self.__class__.__mro__[:-5] # all subclasses after this + if any(["__init__" in c.__dict__.keys() for c in subclasses]): + add_parser_help = False # assume subclass' __init__ will add help else: add_parser_help = True self.arg_parser = argparse.ArgumentParser( description="Upload the folder hierarchy to the content workshop.", add_help=add_parser_help, - parents=[self.arg_parser] + parents=[self.arg_parser], ) - self.arg_parser.add_argument('--channeldir', required=True, + self.arg_parser.add_argument( + "--channeldir", + required=True, action=FolderExistsAction, - help='The directory that corresponds to the root of the channel.') - self.arg_parser.add_argument('--channelinfo', + help="The directory that corresponds to the root of the channel.", + ) + self.arg_parser.add_argument( + "--channelinfo", default=DEFAULT_CHANNEL_INFO_FILENAME, - help='Filename for the channel metadata (assumed to be sibling of channeldir)') - self.arg_parser.add_argument('--contentinfo', + help="Filename for the channel metadata (assumed to be sibling of channeldir)", + ) + self.arg_parser.add_argument( + "--contentinfo", default=DEFAULT_CONTENT_INFO_FILENAME, - help='Filename for content metadata (assumed to be sibling of channeldir)') - self.arg_parser.add_argument('--exercisesinfo', + help="Filename for content metadata (assumed to be sibling of channeldir)", + ) + self.arg_parser.add_argument( + "--exercisesinfo", default=DEFAULT_EXERCISES_INFO_FILENAME, - help='Filename for execises metadata (assumed to be sibling of channeldir)') - self.arg_parser.add_argument('--questionsinfo', + help="Filename for execises metadata (assumed to be sibling of channeldir)", + ) + self.arg_parser.add_argument( + "--questionsinfo", default=DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME, - help='Filename for execise questions metadata (assumed to be sibling of channeldir)') - self.arg_parser.add_argument('--generate', action='store_true', - help='Generate metadata files from directory 
stucture.') - self.arg_parser.add_argument('--importstudioid', - help='Generate CSV metadata from a specified studio_id (e.g. studio_id of main_tree for some channel)') - + help="Filename for execise questions metadata (assumed to be sibling of channeldir)", + ) + self.arg_parser.add_argument( + "--generate", + action="store_true", + help="Generate metadata files from directory stucture.", + ) + self.arg_parser.add_argument( + "--importstudioid", + help="Generate CSV metadata from a specified studio_id (e.g. studio_id of main_tree for some channel)", + ) def _init_metadata_provider(self, args, options): - if args['contentinfo'].endswith('.csv'): - metadata_provider = CsvMetadataProvider(args['channeldir'], - channelinfo=args['channelinfo'], - contentinfo=args['contentinfo'], - exercisesinfo=args['exercisesinfo'], - questionsinfo=args['questionsinfo']) + if args["contentinfo"].endswith(".csv"): + metadata_provider = CsvMetadataProvider( + args["channeldir"], + channelinfo=args["channelinfo"], + contentinfo=args["contentinfo"], + exercisesinfo=args["exercisesinfo"], + questionsinfo=args["questionsinfo"], + ) else: - raise ValueError('Uknown contentinfo file format ' + args['contentinfo']) + raise ValueError("Uknown contentinfo file format " + args["contentinfo"]) self.metadata_provider = metadata_provider def pre_run(self, args, options): """ This function is called before `run` in order to build the json tree. """ - if 'generate' in args and args['generate']: - self.metadata_provider = CsvMetadataProvider(args['channeldir'], - channelinfo=args['channelinfo'], - contentinfo=args['contentinfo'], - exercisesinfo=args['exercisesinfo'], - questionsinfo=args['questionsinfo'], - validate_and_cache=False) + if "generate" in args and args["generate"]: + self.metadata_provider = CsvMetadataProvider( + args["channeldir"], + channelinfo=args["channelinfo"], + contentinfo=args["contentinfo"], + exercisesinfo=args["exercisesinfo"], + questionsinfo=args["questionsinfo"], + validate_and_cache=False, + ) self.metadata_provider.generate_templates(exercise_questions=True) self.metadata_provider.generate_contentinfo_from_channeldir(args, options) sys.exit(0) - elif 'importstudioid' in args and args['importstudioid']: - studio_id = args['importstudioid'] + elif "importstudioid" in args and args["importstudioid"]: + studio_id = args["importstudioid"] config.LOGGER.info("Calling with importstudioid... 
" + studio_id) - self.metadata_provider = CsvMetadataProvider(args['channeldir'], - channelinfo=args['channelinfo'], - contentinfo=args['contentinfo'], - exercisesinfo=args['exercisesinfo'], - questionsinfo=args['questionsinfo'], - validate_and_cache=False) + self.metadata_provider = CsvMetadataProvider( + args["channeldir"], + channelinfo=args["channelinfo"], + contentinfo=args["contentinfo"], + exercisesinfo=args["exercisesinfo"], + questionsinfo=args["questionsinfo"], + validate_and_cache=False, + ) self.metadata_provider.generate_templates(exercise_questions=True) self.metadata_provider.generate_exercises_from_importstudioid(args, options) sys.exit(0) if self.metadata_provider is None: self._init_metadata_provider(args, options) - kwargs = {} # combined dictionary of argparse args and extra options + kwargs = {} # combined dictionary of argparse args and extra options kwargs.update(args) kwargs.update(options) json_tree_path = self.get_json_tree_path(**kwargs) - build_ricecooker_json_tree(args, options, self.metadata_provider, json_tree_path) + build_ricecooker_json_tree( + args, options, self.metadata_provider, json_tree_path + ) class YouTubeSushiChef(SushiChef): @@ -566,10 +699,12 @@ class YouTubeSushiChef(SushiChef): """ CONTENT_ARCHIVE_VERSION = 1 - DATA_DIR = os.path.abspath('chefdata') + DATA_DIR = os.path.abspath("chefdata") YOUTUBE_CACHE_DIR = os.path.join(DATA_DIR, "youtubecache") - DOWNLOADS_DIR = os.path.join(DATA_DIR, 'downloads') - ARCHIVE_DIR = os.path.join(DOWNLOADS_DIR, 'archive_{}'.format(CONTENT_ARCHIVE_VERSION)) + DOWNLOADS_DIR = os.path.join(DATA_DIR, "downloads") + ARCHIVE_DIR = os.path.join( + DOWNLOADS_DIR, "archive_{}".format(CONTENT_ARCHIVE_VERSION) + ) USE_PROXY = False def get_playlist_ids(self): @@ -619,8 +754,8 @@ def get_metadata_for_video(self, field, youtube_id=None, playlist_id=None): return metadata[youtube_id][field] elif playlist_id and playlist_id in metadata and field in metadata[playlist_id]: return metadata[playlist_id][field] - elif field in metadata['defaults']: - return metadata['defaults'][field] + elif field in metadata["defaults"]: + return metadata["defaults"][field] return None @@ -635,16 +770,18 @@ def create_nodes_for_playlists(self): for playlist_id in self.get_playlist_ids(): - playlist = YouTubePlaylistUtils(id=playlist_id, cache_dir=self.YOUTUBE_CACHE_DIR) + playlist = YouTubePlaylistUtils( + id=playlist_id, cache_dir=self.YOUTUBE_CACHE_DIR + ) playlist_info = playlist.get_playlist_info(use_proxy=self.USE_PROXY) # Get channel description if there is any - playlist_description = '' + playlist_description = "" if playlist_info["description"]: playlist_description = playlist_info["description"] - topic_source_id = 'playlist-{0}'.format(playlist_id) + topic_source_id = "playlist-{0}".format(playlist_id) topic_node = nodes.TopicNode( title=playlist_info["title"], source_id=topic_source_id, @@ -658,7 +795,9 @@ def create_nodes_for_playlists(self): for child in playlist_info["children"]: # check for duplicate videos if child["id"] not in video_ids: - video_node = self.create_video_node(child, parent_id=topic_source_id) + video_node = self.create_video_node( + child, parent_id=topic_source_id + ) if video_node: topic_node.add_child(video_node) video_ids.append(child["id"]) @@ -668,7 +807,7 @@ def create_nodes_for_playlists(self): return playlist_nodes - def create_video_node(self, video_id, parent_id='', playlist_id=None): + def create_video_node(self, video_id, parent_id="", playlist_id=None): video = YouTubeVideoUtils(id=video_id, 
cache_dir=False) video_details = video.get_video_info(use_proxy=self.USE_PROXY) if not video_details: @@ -679,7 +818,9 @@ def create_video_node(self, video_id, parent_id='', playlist_id=None): # Check youtube thumbnail extension as some are not supported formats thumbnail_link = video_details["thumbnail"] config.LOGGER.info("thumbnail = {}".format(thumbnail_link)) - archive_filename = get_archive_filename(thumbnail_link, download_root=self.ARCHIVE_DIR) + archive_filename = get_archive_filename( + thumbnail_link, download_root=self.ARCHIVE_DIR + ) dest_file = os.path.join(self.ARCHIVE_DIR, archive_filename) os.makedirs(os.path.dirname(dest_file), exist_ok=True) @@ -689,12 +830,15 @@ def create_video_node(self, video_id, parent_id='', playlist_id=None): response = requests.get(thumbnail_link, stream=True) # Some images that YT returns are actually webp despite their extension, # so make sure we update our file extension to match. - if 'Content-Type' in response.headers and response.headers['Content-Type'] == 'image/webp': + if ( + "Content-Type" in response.headers + and response.headers["Content-Type"] == "image/webp" + ): base_path, ext = os.path.splitext(dest_file) - dest_file = base_path + '.webp' + dest_file = base_path + ".webp" if response.status_code == 200: - with open(dest_file, 'wb') as f: + with open(dest_file, "wb") as f: for chunk in response.iter_content(1024): f.write(chunk) @@ -706,17 +850,21 @@ def create_video_node(self, video_id, parent_id='', playlist_id=None): title=video_details["title"], description=video_details["description"], language=self.channel_info["CHANNEL_LANGUAGE"], - author=self.get_metadata_for_video('author', video_id, playlist_id) or '', - provider=self.get_metadata_for_video('provider', video_id, playlist_id) or '', + author=self.get_metadata_for_video("author", video_id, playlist_id) or "", + provider=self.get_metadata_for_video("provider", video_id, playlist_id) + or "", thumbnail=dest_file, - license=self.get_metadata_for_video('license', video_id, playlist_id), + license=self.get_metadata_for_video("license", video_id, playlist_id), files=[ files.YouTubeVideoFile( youtube_id=video_id, language="en", - high_resolution=self.get_metadata_for_video('high_resolution', video_id, playlist_id) or False + high_resolution=self.get_metadata_for_video( + "high_resolution", video_id, playlist_id + ) + or False, ) - ] + ], ) return video_node @@ -737,7 +885,9 @@ def construct_channel(self, *args, **kwargs): channel = self.get_channel(*args, **kwargs) if len(self.get_playlist_ids()) == 0 and len(self.get_video_ids()) == 0: - raise NotImplementedError("Either get_playlist_ids() or get_video_ids() must be implemented.") + raise NotImplementedError( + "Either get_playlist_ids() or get_video_ids() must be implemented." 
+ ) # TODO: Replace next line with chef code nodes = self.create_nodes_for_playlists() @@ -749,4 +899,3 @@ def construct_channel(self, *args, **kwargs): channel.add_child(node) return channel - diff --git a/ricecooker/classes/files.py b/ricecooker/classes/files.py index 807e558c..4bb88ed2 100644 --- a/ricecooker/classes/files.py +++ b/ricecooker/classes/files.py @@ -1,41 +1,58 @@ # Node models to represent channel's tree from __future__ import unicode_literals -from cachecontrol.caches.file_cache import FileCache import hashlib -import os -from PIL import Image import json -from requests.exceptions import HTTPError, ConnectionError, InvalidURL, InvalidSchema +import os import shutil -from subprocess import CalledProcessError import tempfile -import youtube_dl import zipfile - +from subprocess import CalledProcessError from urllib.parse import urlparse +import youtube_dl +from cachecontrol.caches.file_cache import FileCache +from le_utils.constants import exercises +from le_utils.constants import file_formats +from le_utils.constants import format_presets from le_utils.constants import languages -from le_utils.constants import file_formats, format_presets, exercises -from ricecooker.utils.encodings import get_base64_encoding, write_base64_to_file -from ricecooker.utils.images import create_image_from_pdf_page +from PIL import Image +from requests.exceptions import ConnectionError +from requests.exceptions import HTTPError +from requests.exceptions import InvalidSchema +from requests.exceptions import InvalidURL + +from .. import config +from ..exceptions import UnknownFileTypeError +from ricecooker.utils.encodings import get_base64_encoding +from ricecooker.utils.encodings import write_base64_to_file from ricecooker.utils.images import create_image_from_epub +from ricecooker.utils.images import create_image_from_pdf_page from ricecooker.utils.images import create_image_from_zip -from ricecooker.utils.videos import extract_thumbnail_from_video from ricecooker.utils.images import create_tiled_image from ricecooker.utils.images import ThumbnailGenerationError from ricecooker.utils.subtitles import build_subtitle_converter_from_file +from ricecooker.utils.subtitles import InvalidSubtitleFormatError +from ricecooker.utils.subtitles import InvalidSubtitleLanguageError from ricecooker.utils.subtitles import LANGUAGE_CODE_UNKNOWN -from ricecooker.utils.subtitles import InvalidSubtitleFormatError, InvalidSubtitleLanguageError -from ricecooker.utils.videos import guess_video_preset_by_resolution, compress_video, VideoCompressionError +from ricecooker.utils.videos import compress_video +from ricecooker.utils.videos import extract_thumbnail_from_video +from ricecooker.utils.videos import guess_video_preset_by_resolution +from ricecooker.utils.videos import VideoCompressionError from ricecooker.utils.youtube import YouTubeResource -from .. import config -from ..exceptions import UnknownFileTypeError - # Cache for filenames FILECACHE = FileCache(config.FILECACHE_DIRECTORY, use_dir_lock=True, forever=True) -HTTP_CAUGHT_EXCEPTIONS = (HTTPError, ConnectionError, InvalidURL, UnicodeDecodeError, UnicodeError, InvalidSchema, IOError, AssertionError) +HTTP_CAUGHT_EXCEPTIONS = ( + HTTPError, + ConnectionError, + InvalidURL, + UnicodeDecodeError, + UnicodeError, + InvalidSchema, + IOError, + AssertionError, +) # Lookup table for convertible file formats for a given preset # used for converting avi/flv/etc. 
videos and srt subtitles @@ -50,39 +67,41 @@ def extract_path_ext(path, default_ext=None): _, dotext = os.path.splitext(path) if dotext: ext = dotext[1:] - if len(ext) > 3 and '?' in ext: # strip off any querystring if present - ext = ext.split('?')[0] + if len(ext) > 3 and "?" in ext: # strip off any querystring if present + ext = ext.split("?")[0] else: ext = None if not ext and default_ext: ext = default_ext if not ext: - raise ValueError('No extension in path {} and default_ext is None'.format(path)) + raise ValueError("No extension in path {} and default_ext is None".format(path)) return ext.lower() def generate_key(action, path_or_id, settings=None, default=" (default)"): - """ generate_key: generate key used for caching - Args: - action (str): how video is being processed (e.g. COMPRESSED or DOWNLOADED) - path_or_id (str): path to video or youtube_id - settings (dict): settings for compression or downloading passed in by user - default (str): if settings are None, default to this extension (avoid overwriting keys) - Returns: filename + """generate_key: generate key used for caching + Args: + action (str): how video is being processed (e.g. COMPRESSED or DOWNLOADED) + path_or_id (str): path to video or youtube_id + settings (dict): settings for compression or downloading passed in by user + default (str): if settings are None, default to this extension (avoid overwriting keys) + Returns: filename """ - if settings and 'postprocessors' in settings: + if settings and "postprocessors" in settings: # get determinisic dict serialization for nested dicts under Python 3.5 settings_str = json.dumps(settings, sort_keys=True) else: # keep using old strategy to avoid invalidating all chef caches - settings_str = "{}".format(str(sorted(settings.items()))) if settings else default + settings_str = ( + "{}".format(str(sorted(settings.items()))) if settings else default + ) return "{}: {} {}".format(action.upper(), path_or_id, settings_str) def get_cache_filename(key): cache_file = FILECACHE.get(key) if cache_file: - cache_file = cache_file.decode('utf-8') + cache_file = cache_file.decode("utf-8") # if the file was somehow deleted, make sure we don't return it. if not os.path.exists(config.get_storage_path(cache_file)): cache_file = None @@ -127,7 +146,7 @@ def download(path, default_ext=None): tempf.seek(0) # Get extension of file or use `default_ext` if none found ext = extract_path_ext(path, default_ext=default_ext) - filename = '{0}.{ext}'.format(hash.hexdigest(), ext=ext) + filename = "{0}.{ext}".format(hash.hexdigest(), ext=ext) copy_file_to_storage(filename, tempf) FILECACHE.set(key, bytes(filename, "utf-8")) config.LOGGER.info("\t--- Downloaded {}".format(filename)) @@ -135,7 +154,9 @@ def download(path, default_ext=None): return filename -def download_and_convert_video(path, default_ext=file_formats.MP4, ffmpeg_settings=None): +def download_and_convert_video( + path, default_ext=file_formats.MP4, ffmpeg_settings=None +): """ Auto-converting variant of download function that handles all video formats. 
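To make the cache-key format produced by generate_key() above concrete, here is a small sketch of its two serialization branches; the path and settings values are invented, and the expected strings simply follow the "{}: {} {}" template shown in the function.

# Settings without a "postprocessors" entry: sorted items, old-style key.
key1 = generate_key("COMPRESSED", "abc123.mp4", settings={"crf": 32})
# -> "COMPRESSED: abc123.mp4 [('crf', 32)]"

# No settings at all: the caller-supplied default suffix is appended.
key2 = generate_key("DOWNLOADED", "https://example.org/video.mp4")
# -> "DOWNLOADED: https://example.org/video.mp4  (default)"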
""" @@ -167,7 +188,7 @@ def download_and_convert_video(path, default_ext=file_formats.MP4, ffmpeg_settin with tempfile.TemporaryFile() as tempf2: hash = write_and_get_hash(converted_path, tempf2) tempf2.seek(0) - filename = '{0}.{ext}'.format(hash.hexdigest(), ext=file_formats.MP4) + filename = "{0}.{ext}".format(hash.hexdigest(), ext=file_formats.MP4) copy_file_to_storage(filename, tempf2) os.unlink(converted_path) FILECACHE.set(key, bytes(filename, "utf-8")) @@ -179,7 +200,7 @@ def is_valid_url(path): Return `True` if path is a valid URL, else `False` if path is a local path. """ parts = urlparse(path) - return parts.scheme != '' and parts.netloc != '' + return parts.scheme != "" and parts.netloc != "" def write_and_get_hash(path, write_to_file, hash=None): @@ -207,7 +228,7 @@ def write_and_get_hash(path, write_to_file, hash=None): hash.update(chunk) else: # CASE B: path points to a local filesystem file - with open(path, 'rb') as fobj: + with open(path, "rb") as fobj: for chunk in iter(lambda: fobj.read(2097152), b""): write_to_file.write(chunk) hash.update(chunk) @@ -223,10 +244,10 @@ def copy_file_to_storage(filename, srcfile, delete_original=False): """ # Some files might have been closed, so only filepath will work if isinstance(srcfile, str): - srcfile = open(srcfile, 'rb') + srcfile = open(srcfile, "rb") # Write file to local storage - with open(config.get_storage_path(filename), 'wb') as destf: + with open(config.get_storage_path(filename), "wb") as destf: if delete_original: shutil.move(srcfile.name, destf.name) else: @@ -235,7 +256,7 @@ def copy_file_to_storage(filename, srcfile, delete_original=False): def get_hash(filepath): file_hash = hashlib.md5() - with open(filepath, 'rb') as fobj: + with open(filepath, "rb") as fobj: for chunk in iter(lambda: fobj.read(2097152), b""): file_hash.update(chunk) return file_hash.hexdigest() @@ -247,7 +268,12 @@ def compress_video_file(filename, ffmpeg_settings): stored in storage. Returns the filename of the compressed file. """ ffmpeg_settings = ffmpeg_settings or {} - key = generate_key("COMPRESSED", filename, settings=ffmpeg_settings, default=" (default compression)") + key = generate_key( + "COMPRESSED", + filename, + settings=ffmpeg_settings, + default=" (default compression)", + ) cache_file = get_cache_filename(key) if not config.UPDATE and cache_file: @@ -255,9 +281,13 @@ def compress_video_file(filename, ffmpeg_settings): config.LOGGER.info("\t--- Compressing {}".format(filename)) - tempf = tempfile.NamedTemporaryFile(suffix=".{}".format(file_formats.MP4), delete=False) - tempf.close() # Need to close so pressure cooker can write to file - compress_video(config.get_storage_path(filename), tempf.name, overwrite=True, **ffmpeg_settings) + tempf = tempfile.NamedTemporaryFile( + suffix=".{}".format(file_formats.MP4), delete=False + ) + tempf.close() # Need to close so pressure cooker can write to file + compress_video( + config.get_storage_path(filename), tempf.name, overwrite=True, **ffmpeg_settings + ) compressedfilename = "{}.{}".format(get_hash(tempf.name), file_formats.MP4) copy_file_to_storage(compressedfilename, tempf.name) @@ -266,7 +296,9 @@ def compress_video_file(filename, ffmpeg_settings): return compressedfilename -def download_from_web(web_url, download_settings, file_format=file_formats.MP4, ext="", download_ext=""): +def download_from_web( + web_url, download_settings, file_format=file_formats.MP4, ext="", download_ext="" +): """ Download `web_url` using YoutubeDL using `download_settings` options. 
Args: @@ -289,7 +321,7 @@ def download_from_web(web_url, download_settings, file_format=file_formats.MP4, # Get hash of web_url to act as temporary storage name url_hash = hashlib.md5() - url_hash.update(web_url.encode('utf-8')) + url_hash.update(web_url.encode("utf-8")) tempfilename = "{}{ext}".format(url_hash.hexdigest(), ext=ext) outtmpl_path = os.path.join(tempfile.gettempdir(), tempfilename) download_settings["outtmpl"] = outtmpl_path @@ -307,13 +339,13 @@ def download_from_web(web_url, download_settings, file_format=file_formats.MP4, with youtube_dl.YoutubeDL(download_settings) as ydl: ydl.download([web_url]) if not os.path.exists(destination_path): - raise youtube_dl.utils.DownloadError('Failed to download ' + web_url) + raise youtube_dl.utils.DownloadError("Failed to download " + web_url) else: # Connect to YouTube via an HTTP proxy yt_resource = YouTubeResource(web_url, useproxy=True, options=download_settings) result1 = yt_resource.get_resource_info() if result1 is None: - raise youtube_dl.utils.DownloadError('Failed to get resource info') + raise youtube_dl.utils.DownloadError("Failed to get resource info") download_settings["writethumbnail"] = False # overwrite default behaviour if file_format == file_formats.VTT: # We need to use the proxy when downloading subtitles @@ -322,11 +354,15 @@ def download_from_web(web_url, download_settings, file_format=file_formats.MP4, # For video files we can skip the proxy for faster download speed result2 = yt_resource.download(options=download_settings) if result2 is None or not os.path.exists(destination_path): - raise youtube_dl.utils.DownloadError('Failed to download resource ' + web_url) + raise youtube_dl.utils.DownloadError( + "Failed to download resource " + web_url + ) # Write file to local storage filename = "{}.{}".format(get_hash(destination_path), file_format) - with open(destination_path, "rb") as dlf, open(config.get_storage_path(filename), 'wb') as destf: + with open(destination_path, "rb") as dlf, open( + config.get_storage_path(filename), "wb" + ) as destf: shutil.copyfileobj(dlf, destf) FILECACHE.set(key, bytes(filename, "utf-8")) @@ -334,7 +370,6 @@ def download_from_web(web_url, download_settings, file_format=file_formats.MP4, class ThumbnailPresetMixin(object): - def get_preset(self): thumbnail_preset = self.node.get_thumbnail_preset() if thumbnail_preset is None: @@ -375,31 +410,42 @@ def validate(self): def get_preset(self): if self.preset: return self.preset - raise NotImplementedError("preset must be set if preset isn't specified when creating File object") + raise NotImplementedError( + "preset must be set if preset isn't specified when creating File object" + ) def get_filename(self): return self.filename or self.process_file() - + @property def checksum(self): return self.get_filename().split(".")[0] - + @property def extension(self): return self.get_filename().split(".")[1] - + @property def size(self): return os.path.getsize(config.get_storage_path(self.get_filename())) def truncate_fields(self): - if self.original_filename and len(self.original_filename) > config.MAX_ORIGINAL_FILENAME_LENGTH: - config.print_truncate("original_filename", self.node.source_id, self.original_filename) - self.original_filename = self.original_filename[:config.MAX_ORIGINAL_FILENAME_LENGTH] + if ( + self.original_filename + and len(self.original_filename) > config.MAX_ORIGINAL_FILENAME_LENGTH + ): + config.print_truncate( + "original_filename", self.node.source_id, self.original_filename + ) + self.original_filename = 
self.original_filename[ + : config.MAX_ORIGINAL_FILENAME_LENGTH + ] if self.source_url and len(self.source_url) > config.MAX_SOURCE_URL_LENGTH: - config.print_truncate("file_source_url", self.node.source_id, self.source_url) - self.source_url = self.source_url[:config.MAX_SOURCE_URL_LENGTH] + config.print_truncate( + "file_source_url", self.node.source_id, self.source_url + ) + self.source_url = self.source_url[: config.MAX_SOURCE_URL_LENGTH] def to_dict(self): filename = self.get_filename() @@ -409,15 +455,17 @@ def to_dict(self): if filename: if os.path.isfile(config.get_storage_path(filename)): return { - 'size': self.size, - 'preset': self.get_preset(), - 'filename': filename, - 'original_filename': self.original_filename, - 'language': self.language, - 'source_url': self.source_url, + "size": self.size, + "preset": self.get_preset(), + "filename": filename, + "original_filename": self.original_filename, + "language": self.language, + "source_url": self.source_url, } else: - config.LOGGER.warning("File not found: {}".format(config.get_storage_path(filename))) + config.LOGGER.warning( + "File not found: {}".format(config.get_storage_path(filename)) + ) return None @@ -441,9 +489,12 @@ def validate(self): ext = extract_path_ext(self.path, default_ext=self.default_ext) # don't validate for single-digit extension, or no extension if len(ext) > 1: - assert ext in self.allowed_formats, "{} must have one of the " \ + assert ext in self.allowed_formats, ( + "{} must have one of the " "following extensions: {} (instead, got '{}' from '{}')".format( - self.__class__.__name__, self.allowed_formats, ext, self.path) + self.__class__.__name__, self.allowed_formats, ext, self.path + ) + ) def process_file(self): try: @@ -489,7 +540,7 @@ def process_file(self): image_path = config.get_storage_path(self.filename) img = Image.open(image_path) img.verify() - except IOError as e: # Catch invalid or broken thumbnail files + except IOError as e: # Catch invalid or broken thumbnail files self.filename = None self.error = e config.FAILED_FILES.append(self) @@ -538,7 +589,7 @@ def process_file(self): # make sure index.html exists unless this is a dependency (i.e. shared resources) zip if not self.get_preset() == format_presets.HTML5_DEPENDENCY_ZIP: with zipfile.ZipFile(config.get_storage_path(self.filename)) as zf: - _info = zf.getinfo('index.html') + _info = zf.getinfo("index.html") except KeyError as err: self.filename = None self.error = err @@ -565,7 +616,9 @@ def __init__(self, path, ffmpeg_settings=None, **kwargs): super(VideoFile, self).__init__(path, **kwargs) def get_preset(self): - return self.preset or guess_video_preset_by_resolution(config.get_storage_path(self.filename)) + return self.preset or guess_video_preset_by_resolution( + config.get_storage_path(self.filename) + ) def validate(self): """ @@ -573,16 +626,25 @@ def validate(self): """ assert self.path, "{} must have a path".format(self.__class__.__name__) ext = extract_path_ext(self.path, default_ext=self.default_ext) - if ext not in self.allowed_formats and ext not in CONVERTIBLE_FORMATS[format_presets.VIDEO_HIGH_RES]: - raise ValueError('Incompatible extension {} for VideoFile at {}'.format(ext, self.path)) + if ( + ext not in self.allowed_formats + and ext not in CONVERTIBLE_FORMATS[format_presets.VIDEO_HIGH_RES] + ): + raise ValueError( + "Incompatible extension {} for VideoFile at {}".format(ext, self.path) + ) def process_unsupported_video_file(self): """ Download video at self.path, convert to mp4, and return converted filename. 
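A sketch of the case this method handles: a VideoFile whose source is not already an mp4/webm, so processing falls through to the download-and-convert path. The URL and ffmpeg settings below are placeholders, not values from this codebase.

video_file = VideoFile(
    path="https://example.org/lectures/intro.avi",  # placeholder, non-mp4 source
    ffmpeg_settings={"crf": 32},                    # placeholder conversion settings
)
filename = video_file.process_file()  # downloads, converts to mp4, returns the storage filename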
""" try: - self.filename = download_and_convert_video(self.path, ffmpeg_settings=self.ffmpeg_settings) - config.LOGGER.info("\t--- Downloaded and converted {}".format(self.filename)) + self.filename = download_and_convert_video( + self.path, ffmpeg_settings=self.ffmpeg_settings + ) + config.LOGGER.info( + "\t--- Downloaded and converted {}".format(self.filename) + ) return self.filename except HTTP_CAUGHT_EXCEPTIONS as err: self.error = err @@ -590,8 +652,13 @@ def process_unsupported_video_file(self): def process_file(self): ext = extract_path_ext(self.path, default_ext=self.default_ext) - if ext not in self.allowed_formats and ext not in CONVERTIBLE_FORMATS[format_presets.VIDEO_HIGH_RES]: - raise ValueError('Incompatible extension {} for VideoFile at {}'.format(ext, self.path)) + if ( + ext not in self.allowed_formats + and ext not in CONVERTIBLE_FORMATS[format_presets.VIDEO_HIGH_RES] + ): + raise ValueError( + "Incompatible extension {} for VideoFile at {}".format(ext, self.path) + ) try: if ext not in self.allowed_formats: # Handle videos that don't have an .mp4 or .webm extension @@ -601,9 +668,16 @@ def process_file(self): self.filename = super(VideoFile, self).process_file() # Compress the video if compress flag is set or ffmpeg settings were given if self.filename and (self.ffmpeg_settings or config.COMPRESS): - self.filename = compress_video_file(self.filename, self.ffmpeg_settings) + self.filename = compress_video_file( + self.filename, self.ffmpeg_settings + ) config.LOGGER.info("\t--- Compressed {}".format(self.filename)) - except (BrokenPipeError, CalledProcessError, IOError, VideoCompressionError) as err: + except ( + BrokenPipeError, + CalledProcessError, + IOError, + VideoCompressionError, + ) as err: # Catch errors related to ffmpeg and handle silently self.filename = None self.error = err @@ -616,23 +690,37 @@ class WebVideoFile(File): is_primary = True # In future, look into postprocessors and progress_hooks - def __init__(self, web_url, download_settings=None, high_resolution=False, maxheight=None, **kwargs): + def __init__( + self, + web_url, + download_settings=None, + high_resolution=False, + maxheight=None, + **kwargs + ): self.web_url = web_url self.download_settings = download_settings or {} if "format" not in self.download_settings: maxheight = maxheight or (720 if high_resolution else 480) # Download the best mp4 format available, or best webm format available, or any other best mp4 - self.download_settings['format'] = "bestvideo[height<={maxheight}][ext=mp4]+bestaudio[ext=m4a]/bestvideo[height<={maxheight}][ext=webm]+bestaudio[ext=webm]/best[height<={maxheight}][ext=mp4]".format( - maxheight=maxheight) + self.download_settings[ + "format" + ] = "bestvideo[height<={maxheight}][ext=mp4]+bestaudio[ext=m4a]/bestvideo[height<={maxheight}][ext=webm]+bestaudio[ext=webm]/best[height<={maxheight}][ext=mp4]".format( + maxheight=maxheight + ) # self.download_settings['recodevideo'] = file_formats.MP4 super(WebVideoFile, self).__init__(**kwargs) def get_preset(self): - return self.preset or guess_video_preset_by_resolution(config.get_storage_path(self.filename)) + return self.preset or guess_video_preset_by_resolution( + config.get_storage_path(self.filename) + ) def process_file(self): try: - self.filename = download_from_web(self.web_url, self.download_settings, ext=".{}".format(file_formats.MP4)) + self.filename = download_from_web( + self.web_url, self.download_settings, ext=".{}".format(file_formats.MP4) + ) config.LOGGER.info("\t--- Downloaded (YouTube) 
{}".format(self.filename)) # Compress if compression flag is set @@ -647,9 +735,12 @@ def process_file(self): return self.filename + class YouTubeVideoFile(WebVideoFile): def __init__(self, youtube_id, **kwargs): - super(YouTubeVideoFile, self).__init__('http://www.youtube.com/watch?v={}'.format(youtube_id), **kwargs) + super(YouTubeVideoFile, self).__init__( + "http://www.youtube.com/watch?v={}".format(youtube_id), **kwargs + ) def _get_language_with_alpha2_fallback(language_code): @@ -695,10 +786,12 @@ class YouTubeSubtitleFile(File): """ def __init__(self, youtube_id, language=None, **kwargs): - self.youtube_url = 'http://www.youtube.com/watch?v={}'.format(youtube_id) + self.youtube_url = "http://www.youtube.com/watch?v={}".format(youtube_id) if isinstance(language, languages.Language): language = language.code - self.youtube_language = language # save youtube language code (can differ from internal repr.) + self.youtube_language = ( + language # save youtube language code (can differ from internal repr.) + ) language_obj = _get_language_with_alpha2_fallback(language) super(YouTubeSubtitleFile, self).__init__(language=language_obj.code, **kwargs) assert self.language, "Subtitles must have a language" @@ -712,20 +805,31 @@ def process_file(self): config.LOGGER.info("\t--- Downloaded subtitle {}".format(self.filename)) return self.filename except (FileNotFoundError, youtube_dl.utils.DownloadError): - self.error = str("Subtitle with langauge {} is not available for {}".format(self.language, self.youtube_url)) + self.error = str( + "Subtitle with langauge {} is not available for {}".format( + self.language, self.youtube_url + ) + ) config.FAILED_FILES.append(self) def download_subtitle(self): settings = { - 'skip_download': True, - 'writesubtitles': True, - 'subtitleslangs': [self.youtube_language], - 'subtitlesformat': "best[ext={}]".format(file_formats.VTT), - 'quiet': True, - 'no_warnings': True + "skip_download": True, + "writesubtitles": True, + "subtitleslangs": [self.youtube_language], + "subtitlesformat": "best[ext={}]".format(file_formats.VTT), + "quiet": True, + "no_warnings": True, } - download_ext = ".{lang}.{ext}".format(lang=self.youtube_language, ext=file_formats.VTT) - return download_from_web(self.youtube_url, settings, file_format=file_formats.VTT, download_ext=download_ext) + download_ext = ".{lang}.{ext}".format( + lang=self.youtube_language, ext=file_formats.VTT + ) + return download_from_web( + self.youtube_url, + settings, + file_format=file_formats.VTT, + download_ext=download_ext, + ) class SubtitleFile(DownloadFile): @@ -735,7 +839,7 @@ def __init__(self, path, **kwargs): """ If `subtitlesformat` arg is empty, then type will be detected and converted if supported """ - self.subtitlesformat = kwargs.get('subtitlesformat', None) + self.subtitlesformat = kwargs.get("subtitlesformat", None) if "subtitlesformat" in kwargs: del kwargs["subtitlesformat"] super(SubtitleFile, self).__init__(path, **kwargs) @@ -753,13 +857,23 @@ def validate(self): assert self.path, "{} must have a path".format(self.__class__.__name__) ext = extract_path_ext(self.path, default_ext=self.subtitlesformat) convertible_exts = CONVERTIBLE_FORMATS[self.get_preset()] - if ext != self.default_ext and ext not in convertible_exts and self.subtitlesformat is None: - raise ValueError('Incompatible extension {} for SubtitleFile at {}'.format(ext, self.path)) + if ( + ext != self.default_ext + and ext not in convertible_exts + and self.subtitlesformat is None + ): + raise ValueError( + "Incompatible 
extension {} for SubtitleFile at {}".format( + ext, self.path + ) + ) def process_file(self): self.validate() - caught_errors = HTTP_CAUGHT_EXCEPTIONS + \ - (InvalidSubtitleFormatError, InvalidSubtitleLanguageError) + caught_errors = HTTP_CAUGHT_EXCEPTIONS + ( + InvalidSubtitleFormatError, + InvalidSubtitleLanguageError, + ) try: self.filename = self.download_and_transform_file(self.path) @@ -786,16 +900,20 @@ def download_and_transform_file(self, path): fdin, temp_in_file_name = tempfile.mkstemp() fdout, temp_out_file_name = tempfile.mkstemp() - with open(temp_in_file_name, mode="w+b") as temp_in_file,\ - open(temp_out_file_name, mode="w+b") as temp_out_file: + with open(temp_in_file_name, mode="w+b") as temp_in_file, open( + temp_out_file_name, mode="w+b" + ) as temp_out_file: write_and_get_hash(path, temp_in_file) temp_in_file.seek(0) - converter = build_subtitle_converter_from_file(temp_in_file.name, self.subtitlesformat) + converter = build_subtitle_converter_from_file( + temp_in_file.name, self.subtitlesformat + ) # We'll assume the provided file is in the passed language in this case - if len(converter.get_language_codes()) == 1 \ - and converter.has_language(LANGUAGE_CODE_UNKNOWN): + if len(converter.get_language_codes()) == 1 and converter.has_language( + LANGUAGE_CODE_UNKNOWN + ): converter.replace_unknown_language(self.language) convert_lang_code = self.language @@ -810,14 +928,15 @@ def download_and_transform_file(self, path): break else: raise InvalidSubtitleLanguageError( - "Missing language '{}' in subtitle file".format(self.language)) + "Missing language '{}' in subtitle file".format(self.language) + ) converter.write(temp_out_file.name, convert_lang_code) temp_out_file.seek(0) file_hash = get_hash(temp_out_file.name) - filename = '{0}.{ext}'.format(file_hash, ext=file_formats.VTT) + filename = "{0}.{ext}".format(file_hash, ext=file_formats.VTT) temp_out_file.seek(0) copy_file_to_storage(filename, temp_out_file) @@ -831,15 +950,14 @@ def download_and_transform_file(self, path): class Base64ImageFile(ThumbnailPresetMixin, File): - def __init__(self, encoding, **kwargs): self.encoding = encoding super(Base64ImageFile, self).__init__(**kwargs) def process_file(self): - """ process_file: Writes base64 encoding to file - Args: None - Returns: filename + """process_file: Writes base64 encoding to file + Args: None + Returns: filename """ self.filename = self.convert_base64_to_file() config.LOGGER.info("\t--- Converted base64 image to {}".format(self.filename)) @@ -848,7 +966,7 @@ def process_file(self): def convert_base64_to_file(self): # Get hash of content for cache key hashed_content = hashlib.md5() - hashed_content.update(self.encoding.encode('utf-8')) + hashed_content.update(self.encoding.encode("utf-8")) key = "ENCODED: {} (base64 encoded)".format(hashed_content.hexdigest()) cache_file = get_cache_filename(key) @@ -858,9 +976,15 @@ def convert_base64_to_file(self): config.LOGGER.info("\tConverting base64 to file") extension = get_base64_encoding(self.encoding).group(1) - assert extension in [file_formats.PNG, file_formats.JPG, file_formats.JPEG], "Base64 files must be images in jpg or png format" - - tempf = tempfile.NamedTemporaryFile(suffix=".{}".format(extension), delete=False) + assert extension in [ + file_formats.PNG, + file_formats.JPG, + file_formats.JPEG, + ], "Base64 files must be images in jpg or png format" + + tempf = tempfile.NamedTemporaryFile( + suffix=".{}".format(extension), delete=False + ) tempf.close() write_base64_to_file(self.encoding, tempf.name) 
filename = "{}.{}".format(get_hash(tempf.name), file_formats.PNG) @@ -895,7 +1019,9 @@ class _ExerciseGraphieFile(DownloadFile): default_ext = file_formats.GRAPHIE def __init__(self, path, **kwargs): - self.original_filename = path.split("/")[-1].split(os.path.sep)[-1].split(".")[0] + self.original_filename = ( + path.split("/")[-1].split(os.path.sep)[-1].split(".")[0] + ) super(_ExerciseGraphieFile, self).__init__(path, **kwargs) def get_preset(self): @@ -905,16 +1031,24 @@ def get_replacement_str(self): return self.path.split("/")[-1].split(".")[0] or self.path def process_file(self): - """ download: download a web+graphie file - Args: None - Returns: None + """download: download a web+graphie file + Args: None + Returns: None """ try: self.filename = self.generate_graphie_file() config.LOGGER.info("\t--- Generated graphie {}".format(self.filename)) return self.filename # Catch errors related to reading file path and handle silently - except (HTTPError, ConnectionError, InvalidURL, UnicodeDecodeError, UnicodeError, InvalidSchema, IOError) as err: + except ( + HTTPError, + ConnectionError, + InvalidURL, + UnicodeDecodeError, + UnicodeError, + InvalidSchema, + IOError, + ) as err: self.error = err config.FAILED_FILES.append(self) @@ -928,8 +1062,10 @@ def generate_graphie_file(self): # Create graphie file combining svg and json files with tempfile.TemporaryFile() as tempf: # Initialize hash and files - delimiter = bytes(exercises.GRAPHIE_DELIMITER, 'UTF-8') - config.LOGGER.info("\tDownloading graphie {}".format(self.original_filename)) + delimiter = bytes(exercises.GRAPHIE_DELIMITER, "UTF-8") + config.LOGGER.info( + "\tDownloading graphie {}".format(self.original_filename) + ) # Write to graphie file hash = write_and_get_hash(self.path + ".svg", tempf) @@ -948,6 +1084,7 @@ def generate_graphie_file(self): # EXTRACTED THUMBNAILS ################################################################################ + class ExtractedThumbnailFile(ThumbnailFile): extractor_kwargs = {} # subclass can specify additional options @@ -958,7 +1095,9 @@ def process_file(self): Returns: filename or None """ config.LOGGER.info("\t--- Extracting thumbnail from {}".format(self.path)) - tempf = tempfile.NamedTemporaryFile(suffix=".{}".format(file_formats.PNG), delete=False) + tempf = tempfile.NamedTemporaryFile( + suffix=".{}".format(file_formats.PNG), delete=False + ) tempf.close() try: self.extractor_fun(self.path, tempf.name, **self.extractor_kwargs) @@ -982,43 +1121,46 @@ def extractor_fun(self, fpath_in, thumbpath_out, **kwargs): thumbpath_out: the destination path to write thumbnail to (temp file) **kwargs: any additional class-specific arguments passed in """ - raise NotImplementedError('The subclass must implement this method.') + raise NotImplementedError("The subclass must implement this method.") class ExtractedPdfThumbnailFile(ExtractedThumbnailFile): - extractor_kwargs = {'page_number': 0, 'crop': None} + extractor_kwargs = {"page_number": 0, "crop": None} def extractor_fun(self, fpath_in, thumbpath_out, **kwargs): create_image_from_pdf_page(fpath_in, thumbpath_out, **kwargs) class ExtractedEPubThumbnailFile(ExtractedThumbnailFile): - extractor_kwargs = {'crop': None} + extractor_kwargs = {"crop": None} def extractor_fun(self, fpath_in, thumbpath_out, **kwargs): create_image_from_epub(fpath_in, thumbpath_out, **kwargs) class ExtractedHTMLZipThumbnailFile(ExtractedThumbnailFile): - extractor_kwargs = {'crop': "smart"} + extractor_kwargs = {"crop": "smart"} def extractor_fun(self, fpath_in, 
thumbpath_out, **kwargs): create_image_from_zip(fpath_in, thumbpath_out, **kwargs) class ExtractedVideoThumbnailFile(ExtractedThumbnailFile): - extractor_kwargs = {'overwrite': True} + extractor_kwargs = {"overwrite": True} def extractor_fun(self, fpath_in, thumbpath_out, **kwargs): extract_thumbnail_from_video(fpath_in, thumbpath_out, **kwargs) + class TiledThumbnailFile(ThumbnailPresetMixin, File): allowed_formats = [file_formats.JPG, file_formats.JPEG, file_formats.PNG] def __init__(self, source_nodes, **kwargs): self.sources = [] for n in source_nodes: - images = [f for f in n.files if isinstance(f, ThumbnailFile) and f.get_filename()] + images = [ + f for f in n.files if isinstance(f, ThumbnailFile) and f.get_filename() + ] if len(images) > 0: self.sources.append(images[0]) super(TiledThumbnailFile, self).__init__(**kwargs) @@ -1037,8 +1179,13 @@ def generate_tiled_image(self): else: return None config.LOGGER.info("\tGenerating tiled thumbnail.") - images = [config.get_storage_path(f.get_filename()) for f in self.sources[:num_pictures]] - with tempfile.NamedTemporaryFile(suffix=".{}".format(file_formats.PNG)) as tempf: + images = [ + config.get_storage_path(f.get_filename()) + for f in self.sources[:num_pictures] + ] + with tempfile.NamedTemporaryFile( + suffix=".{}".format(file_formats.PNG) + ) as tempf: tempf.close() create_tiled_image(images, tempf.name) filename = "{}.{}".format(get_hash(tempf.name), file_formats.PNG) diff --git a/ricecooker/classes/licenses.py b/ricecooker/classes/licenses.py index f40b995a..339a3218 100644 --- a/ricecooker/classes/licenses.py +++ b/ricecooker/classes/licenses.py @@ -1,8 +1,8 @@ # License models +from le_utils.constants import licenses -from ..exceptions import UnknownLicenseError from .. import config -from le_utils.constants import licenses +from ..exceptions import UnknownLicenseError def get_license(license_id, copyright_holder=None, description=None): @@ -23,15 +23,23 @@ def get_license(license_id, copyright_holder=None, description=None): elif license_id == licenses.PUBLIC_DOMAIN: return PublicDomainLicense(copyright_holder=copyright_holder) elif license_id == licenses.SPECIAL_PERMISSIONS: - return SpecialPermissionsLicense(copyright_holder=copyright_holder, description=description) + return SpecialPermissionsLicense( + copyright_holder=copyright_holder, description=description + ) else: - raise UnknownLicenseError("{} is not a valid license id. (Valid license are {})".format(license_id, [l[0] for l in licenses.choices])) + raise UnknownLicenseError( + "{} is not a valid license id. 
(Valid license are {})".format( + license_id, [l[0] for l in licenses.choices] + ) + ) class License(object): - license_id = None # (str): content's license based on le_utils.constants.licenses - copyright_holder = None # (str): name of person or organization who owns license (optional) - description = None # (str): description of the license (optional) + license_id = None # (str): content's license based on le_utils.constants.licenses + copyright_holder = ( + None # (str): name of person or organization who owns license (optional) + ) + description = None # (str): description of the license (optional) require_copyright_holder = True def __init__(self, copyright_holder=None, description=None): @@ -42,121 +50,161 @@ def get_id(self): return self.license_id def validate(self): - assert not self.require_copyright_holder or self.copyright_holder != "", "Assertion Failed: {} License requires a copyright holder".format(self.license_id) - assert isinstance(self.copyright_holder, str), "Assertion Failed: Copyright holder must be a string" + assert ( + not self.require_copyright_holder or self.copyright_holder != "" + ), "Assertion Failed: {} License requires a copyright holder".format( + self.license_id + ) + assert isinstance( + self.copyright_holder, str + ), "Assertion Failed: Copyright holder must be a string" def truncate_fields(self): - if self.description and len(self.description) > config.MAX_LICENSE_DESCRIPTION_LENGTH: - config.print_truncate("license_description", self.license_id, self.description) - self.description = self.description[:config.MAX_LICENSE_DESCRIPTION_LENGTH] - - if self.copyright_holder and len(self.copyright_holder) > config.MAX_COPYRIGHT_HOLDER_LENGTH: - config.print_truncate("copyright_holder", self.license_id, self.copyright_holder) - self.copyright_holder = self.copyright_holder[:config.MAX_COPYRIGHT_HOLDER_LENGTH] + if ( + self.description + and len(self.description) > config.MAX_LICENSE_DESCRIPTION_LENGTH + ): + config.print_truncate( + "license_description", self.license_id, self.description + ) + self.description = self.description[: config.MAX_LICENSE_DESCRIPTION_LENGTH] + + if ( + self.copyright_holder + and len(self.copyright_holder) > config.MAX_COPYRIGHT_HOLDER_LENGTH + ): + config.print_truncate( + "copyright_holder", self.license_id, self.copyright_holder + ) + self.copyright_holder = self.copyright_holder[ + : config.MAX_COPYRIGHT_HOLDER_LENGTH + ] def as_dict(self): - return {'license_id': self.license_id, - 'copyright_holder': self.copyright_holder, - 'description': self.description } + return { + "license_id": self.license_id, + "copyright_holder": self.copyright_holder, + "description": self.description, + } + class CC_BYLicense(License): """ - The Attribution License lets others distribute, remix, tweak, - and build upon your work, even commercially, as long as they credit - you for the original creation. This is the most accommodating of - licenses offered. Recommended for maximum dissemination and use of - licensed materials. + The Attribution License lets others distribute, remix, tweak, + and build upon your work, even commercially, as long as they credit + you for the original creation. This is the most accommodating of + licenses offered. Recommended for maximum dissemination and use of + licensed materials. 
- Reference: https://creativecommons.org/licenses/by/4.0 + Reference: https://creativecommons.org/licenses/by/4.0 """ + license_id = licenses.CC_BY + class CC_BY_SALicense(License): """ - The Attribution-ShareAlike License lets others remix, tweak, and - build upon your work even for commercial purposes, as long as they - credit you and license their new creations under the identical terms. - This license is often compared to "copyleft" free and open source - software licenses. All new works based on yours will carry the same - license, so any derivatives will also allow commercial use. This is - the license used by Wikipedia, and is recommended for materials that - would benefit from incorporating content from Wikipedia and similarly - licensed projects. + The Attribution-ShareAlike License lets others remix, tweak, and + build upon your work even for commercial purposes, as long as they + credit you and license their new creations under the identical terms. + This license is often compared to "copyleft" free and open source + software licenses. All new works based on yours will carry the same + license, so any derivatives will also allow commercial use. This is + the license used by Wikipedia, and is recommended for materials that + would benefit from incorporating content from Wikipedia and similarly + licensed projects. - Reference: https://creativecommons.org/licenses/by-sa/4.0 + Reference: https://creativecommons.org/licenses/by-sa/4.0 """ + license_id = licenses.CC_BY_SA + class CC_BY_NDLicense(License): """ - The Attribution-NoDerivs License allows for redistribution, commercial - and non-commercial, as long as it is passed along unchanged and in - whole, with credit to you. + The Attribution-NoDerivs License allows for redistribution, commercial + and non-commercial, as long as it is passed along unchanged and in + whole, with credit to you. - Reference: https://creativecommons.org/licenses/by-nd/4.0 + Reference: https://creativecommons.org/licenses/by-nd/4.0 """ + license_id = licenses.CC_BY_ND + class CC_BY_NCLicense(License): """ - The Attribution-NonCommercial License lets others remix, tweak, and - build upon your work non-commercially, and although their new works - must also acknowledge you and be non-commercial, they don't have to - license their derivative works on the same terms. + The Attribution-NonCommercial License lets others remix, tweak, and + build upon your work non-commercially, and although their new works + must also acknowledge you and be non-commercial, they don't have to + license their derivative works on the same terms. - Reference: https://creativecommons.org/licenses/by-nc/4.0 + Reference: https://creativecommons.org/licenses/by-nc/4.0 """ + license_id = licenses.CC_BY_NC + class CC_BY_NC_SALicense(License): """ - The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, - and build upon your work non-commercially, as long as they credit you and - license their new creations under the identical terms. + The Attribution-NonCommercial-ShareAlike License lets others remix, tweak, + and build upon your work non-commercially, as long as they credit you and + license their new creations under the identical terms. 
- Reference: https://creativecommons.org/licenses/by-nc-sa/4.0 + Reference: https://creativecommons.org/licenses/by-nc-sa/4.0 """ + license_id = licenses.CC_BY_NC_SA + class CC_BY_NC_NDLicense(License): """ - The Attribution-NonCommercial-NoDerivs License is the most restrictive of - our six main licenses, only allowing others to download your works and share - them with others as long as they credit you, but they can't change them in - any way or use them commercially. + The Attribution-NonCommercial-NoDerivs License is the most restrictive of + our six main licenses, only allowing others to download your works and share + them with others as long as they credit you, but they can't change them in + any way or use them commercially. - Reference: https://creativecommons.org/licenses/by-nc-nd/4.0 + Reference: https://creativecommons.org/licenses/by-nc-nd/4.0 """ + license_id = licenses.CC_BY_NC_ND + class AllRightsLicense(License): """ - The All Rights Reserved License indicates that the copyright holder reserves, - or holds for their own use, all the rights provided by copyright law under - one specific copyright treaty. + The All Rights Reserved License indicates that the copyright holder reserves, + or holds for their own use, all the rights provided by copyright law under + one specific copyright treaty. - Reference: http://www.allrights-reserved.com + Reference: http://www.allrights-reserved.com """ + license_id = licenses.ALL_RIGHTS_RESERVED + class PublicDomainLicense(License): """ - Public Domain work has been identified as being free of known restrictions - under copyright law, including all related and neighboring rights. + Public Domain work has been identified as being free of known restrictions + under copyright law, including all related and neighboring rights. - Reference: https://creativecommons.org/publicdomain/mark/1.0 + Reference: https://creativecommons.org/publicdomain/mark/1.0 """ + require_copyright_holder = False license_id = licenses.PUBLIC_DOMAIN + class SpecialPermissionsLicense(License): """ - Special Permissions is a custom license to use when the current licenses - do not apply to the content. The owner of this license is responsible for - creating a description of what this license entails. + Special Permissions is a custom license to use when the current licenses + do not apply to the content. The owner of this license is responsible for + creating a description of what this license entails. 
""" + license_id = licenses.SPECIAL_PERMISSIONS def __init__(self, copyright_holder=None, description=None): assert description, "Special Permissions licenses must have a description" - super(SpecialPermissionsLicense, self).__init__(copyright_holder=copyright_holder, description=description) + super(SpecialPermissionsLicense, self).__init__( + copyright_holder=copyright_holder, description=description + ) diff --git a/ricecooker/classes/nodes.py b/ricecooker/classes/nodes.py index e693c6d0..502ca627 100644 --- a/ricecooker/classes/nodes.py +++ b/ricecooker/classes/nodes.py @@ -1,25 +1,42 @@ # Node models to represent channel's tree - +import csv import json -import uuid import os -import csv +import uuid -from le_utils.constants import content_kinds, exercises, file_formats, format_presets, languages, roles +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import file_formats +from le_utils.constants import format_presets +from le_utils.constants import languages +from le_utils.constants import roles -from .licenses import License -from .. import config, __version__ +from .. import __version__ +from .. import config from ..exceptions import InvalidNodeException +from .licenses import License MASTERY_MODELS = [id for id, name in exercises.MASTERY_MODELS] ROLES = [id for id, name in roles.choices] + class Node(object): """ Node: model to represent all nodes in the tree """ + license = None language = None - def __init__(self, title, language=None, description=None, thumbnail=None, files=None, derive_thumbnail=False, node_modifications = {}, extra_fields=None): + def __init__( + self, + title, + language=None, + description=None, + thumbnail=None, + files=None, + derive_thumbnail=False, + node_modifications={}, + extra_fields=None, + ): self.files = [] self.children = [] self.descendants = [] @@ -52,44 +69,53 @@ def set_language(self, language): def __str__(self): count = self.count() - metadata = "{0} {1}".format(count, "descendant" if count == 1 else "descendants") - return "{title} ({kind}): {metadata}".format(title=self.title, kind=self.__class__.__name__, metadata=metadata) + metadata = "{0} {1}".format( + count, "descendant" if count == 1 else "descendants" + ) + return "{title} ({kind}): {metadata}".format( + title=self.title, kind=self.__class__.__name__, metadata=metadata + ) def truncate_fields(self): if len(self.title) > config.MAX_TITLE_LENGTH: config.print_truncate("title", self.source_id, self.title, kind=self.kind) - self.title = self.title[:config.MAX_TITLE_LENGTH] + self.title = self.title[: config.MAX_TITLE_LENGTH] if self.source_id and len(self.source_id) > config.MAX_SOURCE_ID_LENGTH: - config.print_truncate("source_id", self.source_id, self.source_id, kind=self.kind) - self.source_id = self.source_id[:config.MAX_SOURCE_ID_LENGTH] + config.print_truncate( + "source_id", self.source_id, self.source_id, kind=self.kind + ) + self.source_id = self.source_id[: config.MAX_SOURCE_ID_LENGTH] for f in self.files: f.truncate_fields() def to_dict(self): - """ to_dict: puts data in format CC expects - Args: None - Returns: dict of channel data + """to_dict: puts data in format CC expects + Args: None + Returns: dict of channel data """ pass def add_child(self, node): - """ add_child: Adds child node to node - Args: node to add as child - Returns: None + """add_child: Adds child node to node + Args: node to add as child + Returns: None """ assert isinstance(node, Node), "Child node must be a subclass of Node" node.parent = 
self self.children += [node] def add_file(self, file_to_add): - """ add_file: Add to node's associated files - Args: file_to_add (File): file model to add to node - Returns: None + """add_file: Add to node's associated files + Args: file_to_add (File): file model to add to node + Returns: None """ from .files import File - assert isinstance(file_to_add, File), "Files being added must be instances of a subclass of File class" + + assert isinstance( + file_to_add, File + ), "Files being added must be instances of a subclass of File class" file_to_add.node = self if file_to_add not in self.files: self.files.append(file_to_add) @@ -104,17 +130,19 @@ def generate_thumbnail(self): def has_thumbnail(self): from .files import ThumbnailFile + return any(f for f in self.files if isinstance(f, ThumbnailFile)) # TODO deep check: f.process_file() and check f.filename is not None def set_thumbnail(self, thumbnail): - """ set_thumbnail: Set node's thumbnail - Args: thumbnail (ThumbnailFile): file model to add to node - Returns: None + """set_thumbnail: Set node's thumbnail + Args: thumbnail (ThumbnailFile): file model to add to node + Returns: None """ self.thumbnail = thumbnail if isinstance(self.thumbnail, str): from .files import ThumbnailFile + self.thumbnail = ThumbnailFile(path=self.thumbnail) if self.thumbnail: @@ -169,9 +197,9 @@ def process_files(self): return filenames def count(self): - """ count: get number of nodes in tree - Args: None - Returns: int + """count: get number of nodes in tree + Args: None + Returns: int """ total = len(self.children) for child in self.children: @@ -179,9 +207,9 @@ def count(self): return total def get_topic_count(self): - """ get_topic_count: get number of topics in tree - Args: None - Returns: int + """get_topic_count: get number of topics in tree + Args: None + Returns: int """ total = 0 if self.kind == content_kinds.TOPIC or self.kind == "Channel": @@ -200,60 +228,60 @@ def get_non_topic_descendants(self): return self.descendants def print_tree(self, indent=2): - """ print_tree: prints out structure of tree - Args: indent (int): What level of indentation at which to start printing - Returns: None + """print_tree: prints out structure of tree + Args: indent (int): What level of indentation at which to start printing + Returns: None """ - config.LOGGER.info("{indent}{data}".format(indent=" " * indent, data=str(self))) + config.LOGGER.info( + "{indent}{data}".format(indent=" " * indent, data=str(self)) + ) for child in self.children: child.print_tree(indent + 1) def get_json_tree(self): tree = self.to_dict() if len(self.children) > 0: - tree['children'] = [] + tree["children"] = [] for child in self.children: - tree['children'].append(child.get_json_tree()) + tree["children"].append(child.get_json_tree()) return tree - - def save_channel_children_to_csv(self, metadata_csv, structure_string = ''): + def save_channel_children_to_csv(self, metadata_csv, structure_string=""): # Not including channel title in topic structure is_channel = isinstance(self, ChannelNode) if not is_channel: # Build out tag string - tags_string = ','.join(self.tags) - new_title = self.node_modifications.get('New Title') or '' - new_description = self.node_modifications.get('New Description') or '' - new_tags = self.node_modifications.get('New Tags') or '' + tags_string = ",".join(self.tags) + new_title = self.node_modifications.get("New Title") or "" + new_description = self.node_modifications.get("New Description") or "" + new_tags = self.node_modifications.get("New Tags") or "" # New 
Tags is being saved as a list. Check if list and if so, join to correctly write it to csv if isinstance(new_tags, list): - new_tags = ','.join(new_tags) + new_tags = ",".join(new_tags) record = [ self.source_id, structure_string, self.title, - new_title, # New Title + new_title, # New Title self.description, new_description, # New Description tags_string, - new_tags, # New Tags - '' # Last Modified + new_tags, # New Tags + "", # Last Modified ] metadata_csv.writerow(record) current_level = self.title # add current level to structure_string_list - if structure_string == '': + if structure_string == "": structure_string = self.title else: - structure_string += '/' + self.title + structure_string += "/" + self.title for child in self.children: child.save_channel_children_to_csv(metadata_csv, structure_string) - def validate_tree(self): """ Validate all nodes in this tree recusively. @@ -266,42 +294,60 @@ def validate_tree(self): return True def validate(self): - """ validate: Makes sure node is valid - Args: None - Returns: boolean indicating if node is valid + """validate: Makes sure node is valid + Args: None + Returns: boolean indicating if node is valid """ from .files import File - assert self.source_id is not None, "Assumption Failed: Node must have a source_id" - assert isinstance(self.title, str), "Assumption Failed: Node title is not a string" - assert len(self.title.strip()) > 0, "Assumption Failed: Node title cannot be empty" - assert isinstance(self.description, str) or self.description is None, "Assumption Failed: Node description is not a string" - assert isinstance(self.children, list), "Assumption Failed: Node children is not a list" + assert ( + self.source_id is not None + ), "Assumption Failed: Node must have a source_id" + assert isinstance( + self.title, str + ), "Assumption Failed: Node title is not a string" + assert ( + len(self.title.strip()) > 0 + ), "Assumption Failed: Node title cannot be empty" + assert ( + isinstance(self.description, str) or self.description is None + ), "Assumption Failed: Node description is not a string" + assert isinstance( + self.children, list + ), "Assumption Failed: Node children is not a list" for f in self.files: assert isinstance(f, File), "Assumption Failed: files must be file class" f.validate() source_ids = [c.source_id for c in self.children] duplicates = set([x for x in source_ids if source_ids.count(x) > 1]) - assert len(duplicates) == 0, "Assumption Failed: Node must have unique source id among siblings ({} appears multiple times)".format(duplicates) + assert ( + len(duplicates) == 0 + ), "Assumption Failed: Node must have unique source id among siblings ({} appears multiple times)".format( + duplicates + ) return True class ChannelNode(Node): - """ Model representing the channel you are creating + """Model representing the channel you are creating - Used to store metadata on channel that is being created + Used to store metadata on channel that is being created - Attributes: - source_id (str): channel's unique id - source_domain (str): who is providing the content (e.g. learningequality.org) - title (str): name of channel - description (str): description of the channel (optional) - thumbnail (str): file path or url of channel's thumbnail (optional) - files ([]): list of file objects for node (optional) + Attributes: + source_id (str): channel's unique id + source_domain (str): who is providing the content (e.g. 
learningequality.org) + title (str): name of channel + description (str): description of the channel (optional) + thumbnail (str): file path or url of channel's thumbnail (optional) + files ([]): list of file objects for node (optional) """ + kind = "Channel" - def __init__(self, source_id, source_domain, tagline=None, channel_id=None, *args, **kwargs): + + def __init__( + self, source_id, source_domain, tagline=None, channel_id=None, *args, **kwargs + ): # Map parameters to model variables self.channel_id = channel_id self.source_domain = source_domain @@ -321,23 +367,27 @@ def get_node_id(self): def truncate_fields(self): if self.description and len(self.description) > config.MAX_DESCRIPTION_LENGTH: - config.print_truncate("description", self.source_id, self.description, kind=self.kind) - self.description = self.description[:config.MAX_DESCRIPTION_LENGTH] + config.print_truncate( + "description", self.source_id, self.description, kind=self.kind + ) + self.description = self.description[: config.MAX_DESCRIPTION_LENGTH] if self.tagline and len(self.tagline) > config.MAX_TAGLINE_LENGTH: - config.print_truncate("tagline", self.source_id, self.tagline, kind=self.kind) - self.tagline = self.tagline[:config.MAX_TAGLINE_LENGTH] + config.print_truncate( + "tagline", self.source_id, self.tagline, kind=self.kind + ) + self.tagline = self.tagline[: config.MAX_TAGLINE_LENGTH] super(ChannelNode, self).truncate_fields() def to_dict(self): - """ to_dict: puts channel data into the format that Kolibri Studio expects - Args: None - Returns: dict of channel data + """to_dict: puts channel data into the format that Kolibri Studio expects + Args: None + Returns: dict of channel data """ return { "id": self.channel_id or self.get_node_id().hex, "name": self.title, "thumbnail": self.thumbnail.filename if self.thumbnail else None, - "language" : self.language, + "language": self.language, "description": self.description or "", "tagline": self.tagline or "", "license": self.license, @@ -345,42 +395,65 @@ def to_dict(self): "source_id": self.source_id, "ricecooker_version": __version__, "extra_fields": json.dumps(self.extra_fields), - "files": [f.to_dict() for f in self.files if f and f.filename and not (self.thumbnail and self.thumbnail.filename is f.filename)], + "files": [ + f.to_dict() + for f in self.files + if f + and f.filename + and not (self.thumbnail and self.thumbnail.filename is f.filename) + ], } def validate(self): - """ validate: Makes sure channel is valid - Args: None - Returns: boolean indicating if channel is valid + """validate: Makes sure channel is valid + Args: None + Returns: boolean indicating if channel is valid """ try: - assert isinstance(self.source_domain, str), "Channel domain must be a string" + assert isinstance( + self.source_domain, str + ), "Channel domain must be a string" assert self.language, "Channel must have a language" return super(ChannelNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid channel ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid channel ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class TreeNode(Node): - """ Model representing the content nodes in the channel's tree - - Base model for different content node kinds (topic, video, exercise, etc.) 
- - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - description (str): description of content (optional) - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - files ([]): list of file objects for node (optional) - tags ([str]): list of tags for node (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) + """Model representing the content nodes in the channel's tree + + Base model for different content node kinds (topic, video, exercise, etc.) + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + description (str): description of content (optional) + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + files ([]): list of file objects for node (optional) + tags ([str]): list of tags for node (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) """ - def __init__(self, source_id, title, author="", aggregator="", provider="", tags=None, domain_ns=None, **kwargs): + + def __init__( + self, + source_id, + title, + author="", + aggregator="", + provider="", + tags=None, + domain_ns=None, + **kwargs + ): # Map parameters to model variables assert isinstance(source_id, str), "source_id must be a string" self.source_id = source_id @@ -389,7 +462,9 @@ def __init__(self, source_id, title, author="", aggregator="", provider="", tags self.provider = provider or "" self.tags = tags or [] self.domain_ns = domain_ns - self.questions = self.questions if hasattr(self, 'questions') else [] # Needed for to_dict method + self.questions = ( + self.questions if hasattr(self, "questions") else [] + ) # Needed for to_dict method super(TreeNode, self).__init__(title, **kwargs) @@ -404,30 +479,37 @@ def get_content_id(self): return self.content_id def get_node_id(self): - assert self.parent, "Parent not found: node id must be calculated based on parent" + assert ( + self.parent + ), "Parent not found: node id must be calculated based on parent" if not self.node_id: - self.node_id = uuid.uuid5(self.parent.get_node_id(), self.get_content_id().hex) + self.node_id = uuid.uuid5( + self.parent.get_node_id(), self.get_content_id().hex + ) return self.node_id - def truncate_fields(self): if self.author and len(self.author) > config.MAX_AUTHOR_LENGTH: config.print_truncate("author", self.source_id, self.author, kind=self.kind) - self.author = self.author[:config.MAX_AUTHOR_LENGTH] + self.author = self.author[: config.MAX_AUTHOR_LENGTH] if self.aggregator and len(self.aggregator) > config.MAX_AGGREGATOR_LENGTH: - config.print_truncate("aggregator", self.source_id, self.aggregator, kind=self.kind) - self.aggregator = self.aggregator[:config.MAX_AGGREGATOR_LENGTH] + config.print_truncate( + 
"aggregator", self.source_id, self.aggregator, kind=self.kind + ) + self.aggregator = self.aggregator[: config.MAX_AGGREGATOR_LENGTH] if self.provider and len(self.provider) > config.MAX_PROVIDER_LENGTH: - config.print_truncate("provider", self.source_id, self.provider, kind=self.kind) - self.provider = self.provider[:config.MAX_PROVIDER_LENGTH] + config.print_truncate( + "provider", self.source_id, self.provider, kind=self.kind + ) + self.provider = self.provider[: config.MAX_PROVIDER_LENGTH] self.license and self.license.truncate_fields() super(TreeNode, self).truncate_fields() - def sort_children(self, key = None, reverse = False): + def sort_children(self, key=None, reverse=False): """ Sort children of TreeNode :param key: A Function to execute to decide the order. Default None @@ -436,19 +518,23 @@ def sort_children(self, key = None, reverse = False): # default natural sorting if not key: convert = lambda text: int(text) if text.isdigit() else text.lower() - key = lambda key: [ convert(re.sub(r'[^A-Za-z0-9]+', '', c.replace('&', 'and'))) for c in re.split('([0-9]+)', key.title) ] - self.children = sorted(self.children, key = key, reverse = reverse) + key = lambda key: [ + convert(re.sub(r"[^A-Za-z0-9]+", "", c.replace("&", "and"))) + for c in re.split("([0-9]+)", key.title) + ] + self.children = sorted(self.children, key=key, reverse=reverse) return self.children def to_dict(self): - """ to_dict: puts Topic or Content node data into the format that Kolibri Studio expects - Args: None - Returns: dict of channel data + """to_dict: puts Topic or Content node data into the format that Kolibri Studio expects + Args: None + Returns: dict of channel data """ return { - "title": self.node_modifications.get('New Title') or self.title, - "language" : self.language, - "description": self.node_modifications.get('New Description') or self.description, + "title": self.node_modifications.get("New Title") or self.title, + "language": self.language, + "description": self.node_modifications.get("New Description") + or self.description, "node_id": self.get_node_id().hex, "content_id": self.get_content_id().hex, "source_domain": self.domain_ns.hex, @@ -456,8 +542,10 @@ def to_dict(self): "author": self.author, "aggregator": self.aggregator, "provider": self.provider, - "files" : [f.to_dict() for f in self.files if f and f.filename], # Filter out failed downloads - "tags": self.node_modifications.get('New Tags') or self.tags, + "files": [ + f.to_dict() for f in self.files if f and f.filename + ], # Filter out failed downloads + "tags": self.node_modifications.get("New Tags") or self.tags, "kind": self.kind, "license": None, "license_description": None, @@ -467,35 +555,46 @@ def to_dict(self): } def validate(self): - """ validate: Makes sure content node is valid - Args: None - Returns: boolean indicating if content node is valid + """validate: Makes sure content node is valid + Args: None + Returns: boolean indicating if content node is valid """ - assert isinstance(self.author, str) , "Assumption Failed: Author is not a string" - assert isinstance(self.aggregator, str) , "Assumption Failed: Aggregator is not a string" - assert isinstance(self.provider, str) , "Assumption Failed: Provider is not a string" + assert isinstance(self.author, str), "Assumption Failed: Author is not a string" + assert isinstance( + self.aggregator, str + ), "Assumption Failed: Aggregator is not a string" + assert isinstance( + self.provider, str + ), "Assumption Failed: Provider is not a string" assert isinstance(self.files, 
list), "Assumption Failed: Files is not a list" - assert isinstance(self.questions, list), "Assumption Failed: Questions is not a list" - assert isinstance(self.extra_fields, dict), "Assumption Failed: Extra fields is not a dict" + assert isinstance( + self.questions, list + ), "Assumption Failed: Questions is not a list" + assert isinstance( + self.extra_fields, dict + ), "Assumption Failed: Extra fields is not a dict" assert isinstance(self.tags, list), "Assumption Failed: Tags is not a list" for tag in self.tags: - assert isinstance(tag, str) , "Assumption Failed: Tag is not a string" - assert len(tag) <= 50, "ERROR: tag " + tag + " is too long. Tags should be 50 chars or less." + assert isinstance(tag, str), "Assumption Failed: Tag is not a string" + assert len(tag) <= 50, ( + "ERROR: tag " + tag + " is too long. Tags should be 50 chars or less." + ) return super(TreeNode, self).validate() class TopicNode(TreeNode): - """ Model representing channel topics + """Model representing channel topics - Topic nodes are used to add organization to the channel's content + Topic nodes are used to add organization to the channel's content - Attributes: - source_id (str): content's original id - title (str): content's title - description (str): description of content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - derive_thumbnail (bool): set to generate tiled thumbnail from children (optional) + Attributes: + source_id (str): content's original id + title (str): content's title + description (str): description of content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + derive_thumbnail (bool): set to generate tiled thumbnail from children (optional) """ + kind = content_kinds.TOPIC def generate_thumbnail(self): @@ -503,65 +602,95 @@ def generate_thumbnail(self): Returns: a Thumbnail file or None. """ from .files import TiledThumbnailFile + return TiledThumbnailFile(self.get_non_topic_descendants()) def validate(self): - """ validate: Makes sure topic is valid - Args: None - Returns: boolean indicating if topic is valid + """validate: Makes sure topic is valid + Args: None + Returns: boolean indicating if topic is valid """ try: - assert self.kind == content_kinds.TOPIC, "Assumption Failed: Node is supposed to be a topic" + assert ( + self.kind == content_kinds.TOPIC + ), "Assumption Failed: Node is supposed to be a topic" return super(TopicNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class ContentNode(TreeNode): - """ Model representing the content nodes in the channel's tree - - Base model for different content node kinds (topic, video, exercise, etc.) 
- - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - description (str): description of content (optional) - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - role (str): set to roles.COACH for teacher-facing materials (default roles.LEARNER) - thumbnail (str): local path or url to thumbnail image (optional) - derive_thumbnail (bool): set to generate thumbnail from content (optional) - files ([]): list of file objects for node (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) + """Model representing the content nodes in the channel's tree + + Base model for different content node kinds (topic, video, exercise, etc.) + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + description (str): description of content (optional) + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + role (str): set to roles.COACH for teacher-facing materials (default roles.LEARNER) + thumbnail (str): local path or url to thumbnail image (optional) + derive_thumbnail (bool): set to generate thumbnail from content (optional) + files ([]): list of file objects for node (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. 
learningequality.org) (optional) """ + required_file_format = None - def __init__(self, source_id, title, license, role=roles.LEARNER, license_description=None, copyright_holder=None, **kwargs): + def __init__( + self, + source_id, + title, + license, + role=roles.LEARNER, + license_description=None, + copyright_holder=None, + **kwargs + ): self.role = role - self.set_license(license, copyright_holder=copyright_holder, description=license_description) + self.set_license( + license, copyright_holder=copyright_holder, description=license_description + ) super(ContentNode, self).__init__(source_id, title, **kwargs) def __str__(self): - metadata = "{0} {1}".format(len(self.files), "file" if len(self.files) == 1 else "files") - return "{title} ({kind}): {metadata}".format(title=self.title, kind=self.__class__.__name__, metadata=metadata) + metadata = "{0} {1}".format( + len(self.files), "file" if len(self.files) == 1 else "files" + ) + return "{title} ({kind}): {metadata}".format( + title=self.title, kind=self.__class__.__name__, metadata=metadata + ) def set_license(self, license, copyright_holder=None, description=None): # Add license (create model if it's just a path) if isinstance(license, str): from .licenses import get_license - license = get_license(license, copyright_holder=copyright_holder, description=description) + + license = get_license( + license, copyright_holder=copyright_holder, description=description + ) self.license = license def validate(self): - """ validate: Makes sure content node is valid - Args: None - Returns: boolean indicating if content node is valid + """validate: Makes sure content node is valid + Args: None + Returns: boolean indicating if content node is valid """ - assert self.role in ROLES, "Assumption Failed: Role must be one of the following {}".format(ROLES) - assert isinstance(self.license, str) or isinstance(self.license, License), "Assumption Failed: License is not a string or license object" + assert ( + self.role in ROLES + ), "Assumption Failed: Role must be one of the following {}".format(ROLES) + assert isinstance(self.license, str) or isinstance( + self.license, License + ), "Assumption Failed: License is not a string or license object" self.license.validate() # if self.required_file_format: # files_valid = False @@ -572,14 +701,15 @@ def validate(self): return super(ContentNode, self).validate() def to_dict(self): - """ to_dict: puts data in format CC expects - Args: None - Returns: dict of channel data + """to_dict: puts data in format CC expects + Args: None + Returns: dict of channel data """ return { - "title": self.node_modifications.get('New Title') or self.title, - "language" : self.language, - "description": self.node_modifications.get('New Description') or self.description, + "title": self.node_modifications.get("New Title") or self.title, + "language": self.language, + "description": self.node_modifications.get("New Description") + or self.description, "node_id": self.get_node_id().hex, "content_id": self.get_content_id().hex, "source_domain": self.domain_ns.hex, @@ -587,8 +717,10 @@ def to_dict(self): "author": self.author, "aggregator": self.aggregator, "provider": self.provider, - "files" : [f.to_dict() for f in filter(lambda x: x and x.filename, self.files)], # Filter out failed downloads - "tags": self.node_modifications.get('New Tags') or self.tags, + "files": [ + f.to_dict() for f in filter(lambda x: x and x.filename, self.files) + ], # Filter out failed downloads + "tags": self.node_modifications.get("New Tags") or self.tags, 
"kind": self.kind, "license": self.license.license_id, "license_description": self.license.description, @@ -600,24 +732,25 @@ def to_dict(self): class VideoNode(ContentNode): - """ Model representing videos in channel - - Videos must be mp4 or webm format - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - derive_thumbnail (bool): set to generate thumbnail from video (optional) - thumbnail (str): local path or url to thumbnail image (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - files ([]): list of file objects for node (optional) + """Model representing videos in channel + + Videos must be mp4 or webm format + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + derive_thumbnail (bool): set to generate thumbnail from video (optional) + thumbnail (str): local path or url to thumbnail image (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. 
learningequality.org) (optional) + files ([]): list of file objects for node (optional) """ + kind = content_kinds.VIDEO required_file_format = (file_formats.MP4, file_formats.WEBM) @@ -626,7 +759,12 @@ def __init__(self, source_id, title, license, **kwargs): def generate_thumbnail(self): from .files import VideoFile, WebVideoFile, ExtractedVideoThumbnailFile - video_files = [f for f in self.files if isinstance(f, VideoFile) or isinstance(f, WebVideoFile)] + + video_files = [ + f + for f in self.files + if isinstance(f, VideoFile) or isinstance(f, WebVideoFile) + ] if video_files: video_file = video_files[0] if video_file.filename and not video_file.error: @@ -635,31 +773,51 @@ def generate_thumbnail(self): return None def validate(self): - """ validate: Makes sure video is valid - Args: None - Returns: boolean indicating if video is valid + """validate: Makes sure video is valid + Args: None + Returns: boolean indicating if video is valid """ from .files import VideoFile, WebVideoFile, SubtitleFile, YouTubeSubtitleFile + try: - assert self.kind == content_kinds.VIDEO, "Assumption Failed: Node should be a video" - assert self.questions == [], "Assumption Failed: Video should not have questions" - assert len(self.files) > 0, "Assumption Failed: Video must have at least one video file" + assert ( + self.kind == content_kinds.VIDEO + ), "Assumption Failed: Node should be a video" + assert ( + self.questions == [] + ), "Assumption Failed: Video should not have questions" + assert ( + len(self.files) > 0 + ), "Assumption Failed: Video must have at least one video file" # Check if there are any .mp4 files if there are video files (other video types don't have paths) - assert any(f for f in self.files if isinstance(f, VideoFile) or isinstance(f, WebVideoFile)), "Assumption Failed: Video node should have at least one video file" + assert any( + f + for f in self.files + if isinstance(f, VideoFile) or isinstance(f, WebVideoFile) + ), "Assumption Failed: Video node should have at least one video file" # Ensure that there is only one subtitle file per language code new_files = [] language_codes_seen = set() for file in self.files: - if isinstance(file, SubtitleFile) or isinstance(file, YouTubeSubtitleFile): + if isinstance(file, SubtitleFile) or isinstance( + file, YouTubeSubtitleFile + ): language_code = file.language if language_code not in language_codes_seen: new_files.append(file) language_codes_seen.add(language_code) else: - file_info = file.path if hasattr(file, 'path') else file.youtube_url - config.LOGGER.warning('Skipping duplicate subs for ' + language_code + ' from ' + file_info) + file_info = ( + file.path if hasattr(file, "path") else file.youtube_url + ) + config.LOGGER.warning( + "Skipping duplicate subs for " + + language_code + + " from " + + file_info + ) else: new_files.append(file) self.files = new_files @@ -667,91 +825,134 @@ def validate(self): return super(VideoNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class AudioNode(ContentNode): - """ Model representing audio content in channel - - Audio must be in mp3 format - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content 
collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - derive_thumbnail (bool): set to generate waveform thumbnail (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - files ([]): list of file objects for node (optional) + """Model representing audio content in channel + + Audio must be in mp3 format + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + derive_thumbnail (bool): set to generate waveform thumbnail (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) + files ([]): list of file objects for node (optional) """ + kind = content_kinds.AUDIO required_file_format = file_formats.MP3 def validate(self): - """ validate: Makes sure audio is valid - Args: None - Returns: boolean indicating if audio is valid + """validate: Makes sure audio is valid + Args: None + Returns: boolean indicating if audio is valid """ from .files import AudioFile + try: - assert self.kind == content_kinds.AUDIO, "Assumption Failed: Node should be audio" - assert self.questions == [], "Assumption Failed: Audio should not have questions" - assert len(self.files) > 0, "Assumption Failed: Audio should have at least one file" - assert [f for f in self.files if isinstance(f, AudioFile)], "Assumption Failed: Audio should have at least one audio file" + assert ( + self.kind == content_kinds.AUDIO + ), "Assumption Failed: Node should be audio" + assert ( + self.questions == [] + ), "Assumption Failed: Audio should not have questions" + assert ( + len(self.files) > 0 + ), "Assumption Failed: Audio should have at least one file" + assert [ + f for f in self.files if isinstance(f, AudioFile) + ], "Assumption Failed: Audio should have at least one audio file" return super(AudioNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class DocumentNode(ContentNode): - """ Model representing documents in channel - - Documents must be in PDF or ePub format - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - derive_thumbnail (bool): 
automatically generate thumbnail (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - files ([]): list of file objects for node (optional) + """Model representing documents in channel + + Documents must be in PDF or ePub format + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + derive_thumbnail (bool): automatically generate thumbnail (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) + files ([]): list of file objects for node (optional) """ + kind = content_kinds.DOCUMENT - required_file_format = file_formats.PDF # TODO(ivan) change ro allowed_formats + required_file_format = file_formats.PDF  # TODO(ivan) change to allowed_formats def validate(self): - """ validate: Makes sure document node contains at least one EPUB or PDF - Args: None - Returns: boolean indicating if document is valid + """validate: Makes sure document node contains at least one EPUB or PDF + Args: None + Returns: boolean indicating if document is valid """ from .files import DocumentFile, EPubFile + try: - assert self.kind == content_kinds.DOCUMENT, "Assumption Failed: Node should be a document" - assert self.questions == [], "Assumption Failed: Document should not have questions" - assert len(self.files) > 0, "Assumption Failed: Document should have at least one file" - assert [f for f in self.files if isinstance(f, DocumentFile) or isinstance(f, EPubFile)], \ - "Assumption Failed: Document should have at least one document file" + assert ( + self.kind == content_kinds.DOCUMENT + ), "Assumption Failed: Node should be a document" + assert ( + self.questions == [] + ), "Assumption Failed: Document should not have questions" + assert ( + len(self.files) > 0 + ), "Assumption Failed: Document should have at least one file" + assert [ + f + for f in self.files + if isinstance(f, DocumentFile) or isinstance(f, EPubFile) + ], "Assumption Failed: Document should have at least one document file" return super(DocumentNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) def generate_thumbnail(self): - from .files import DocumentFile, EPubFile, ExtractedPdfThumbnailFile, ExtractedEPubThumbnailFile + from .files import ( + DocumentFile, + EPubFile, + ExtractedPdfThumbnailFile, + ExtractedEPubThumbnailFile, + ) + pdf_files = [f for f in self.files if isinstance(f, DocumentFile)] epub_files = [f for f in self.files if isinstance(f, EPubFile)] if pdf_files and epub_files: - raise InvalidNodeException("Invalid node (both PDF and ePub provided): {} - {}".format(self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node (both PDF and ePub provided): {} - {}".format( + self.title, self.__dict__ + ) + ) elif
pdf_files: pdf_file = pdf_files[0] if pdf_file.filename and not pdf_file.error: @@ -766,30 +967,32 @@ def generate_thumbnail(self): class HTML5AppNode(ContentNode): - """ Model representing a zipped HTML5 application - - The zip file must contain a file called index.html, which will be the first page loaded. - All links (e.g. href and src) must be relative URLs, pointing to other files in the zip. - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - derive_thumbnail (bool): generate thumbnail from largest image inside zip (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - files ([]): list of file objects for node (optional) + """Model representing a zipped HTML5 application + + The zip file must contain a file called index.html, which will be the first page loaded. + All links (e.g. href and src) must be relative URLs, pointing to other files in the zip. + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + derive_thumbnail (bool): generate thumbnail from largest image inside zip (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. 
learningequality.org) (optional) + files ([]): list of file objects for node (optional) """ + kind = content_kinds.HTML5 required_file_format = file_formats.HTML5 def generate_thumbnail(self): from .files import HTMLZipFile, ExtractedHTMLZipThumbnailFile + html5_files = [f for f in self.files if isinstance(f, HTMLZipFile)] if html5_files: html_file = html5_files[0] @@ -800,79 +1003,105 @@ def generate_thumbnail(self): return None def validate(self): - """ validate: Makes sure HTML5 app is valid - Args: None - Returns: boolean indicating if HTML5 app is valid + """validate: Makes sure HTML5 app is valid + Args: None + Returns: boolean indicating if HTML5 app is valid """ from .files import HTMLZipFile + try: - assert self.kind == content_kinds.HTML5, "Assumption Failed: Node should be an HTML5 app" - assert self.questions == [], "Assumption Failed: HTML should not have questions" - assert [f for f in self.files if isinstance(f, HTMLZipFile)], "Assumption Failed: HTML should have at least one html file" + assert ( + self.kind == content_kinds.HTML5 + ), "Assumption Failed: Node should be an HTML5 app" + assert ( + self.questions == [] + ), "Assumption Failed: HTML should not have questions" + assert [ + f for f in self.files if isinstance(f, HTMLZipFile) + ], "Assumption Failed: HTML should have at least one html file" return super(HTML5AppNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class H5PAppNode(ContentNode): - """ Model representing a H5P content nodes - - The .h5p file is self-contained and inlcuding media and javascript libs. - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - thumbnail (str): local path or url to thumbnail image (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - files ([]): list of file objects for node (optional) + """Model representing an H5P content node + + The .h5p file is self-contained and includes media and javascript libs. + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + thumbnail (str): local path or url to thumbnail image (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g.
learningequality.org) (optional) + files ([]): list of file objects for node (optional) """ + kind = content_kinds.H5P required_file_format = file_formats.H5P def validate(self): - """ validate: Makes sure H5P app is valid - Args: None - Returns: boolean indicating if H5P app is valid + """validate: Makes sure H5P app is valid + Args: None + Returns: boolean indicating if H5P app is valid """ from .files import H5PFile + try: - assert self.kind == content_kinds.H5P, "Assumption Failed: Node should be an H5P app" - assert self.questions == [], "Assumption Failed: HTML should not have questions" - assert [f for f in self.files if isinstance(f, H5PFile)], "Assumption Failed: H5PAppNode should have at least one h5p file" + assert ( + self.kind == content_kinds.H5P + ), "Assumption Failed: Node should be an H5P app" + assert ( + self.questions == [] + ), "Assumption Failed: HTML should not have questions" + assert [ + f for f in self.files if isinstance(f, H5PFile) + ], "Assumption Failed: H5PAppNode should have at least one h5p file" return super(H5PAppNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class ExerciseNode(ContentNode): - """ Model representing exercises in channel - - Exercises are sets of questions to assess learners' - understanding of the content - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - exercise_data ({mastery_model:str, randomize:bool, m:int, n:int}): data on mastery requirements (optional) - thumbnail (str): local path or url to thumbnail image (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) - questions ([]): list of question objects for node (optional) + """Model representing exercises in channel + + Exercises are sets of questions to assess learners' + understanding of the content + + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + exercise_data ({mastery_model:str, randomize:bool, m:int, n:int}): data on mastery requirements (optional) + thumbnail (str): local path or url to thumbnail image (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. 
learningequality.org) (optional) + questions ([]): list of question objects for node (optional) """ + kind = content_kinds.EXERCISE - def __init__(self, source_id, title, license, questions=None, exercise_data=None, **kwargs): + def __init__( + self, source_id, title, license, questions=None, exercise_data=None, **kwargs + ): self.questions = questions or [] # Set mastery model defaults if none provided @@ -881,21 +1110,29 @@ def __init__(self, source_id, title, license, questions=None, exercise_data=None if isinstance(exercise_data, str): exercise_data = {"mastery_model": exercise_data} - exercise_data.update({ - 'mastery_model': exercise_data.get('mastery_model', exercises.M_OF_N), - 'randomize': exercise_data.get('randomize', True), - }) + exercise_data.update( + { + "mastery_model": exercise_data.get("mastery_model", exercises.M_OF_N), + "randomize": exercise_data.get("randomize", True), + } + ) - super(ExerciseNode, self).__init__(source_id, title, license, extra_fields=exercise_data, **kwargs) + super(ExerciseNode, self).__init__( + source_id, title, license, extra_fields=exercise_data, **kwargs + ) def __str__(self): - metadata = "{0} {1}".format(len(self.questions), "question" if len(self.questions) == 1 else "questions") - return "{title} ({kind}): {metadata}".format(title=self.title, kind=self.__class__.__name__, metadata=metadata) + metadata = "{0} {1}".format( + len(self.questions), "question" if len(self.questions) == 1 else "questions" + ) + return "{title} ({kind}): {metadata}".format( + title=self.title, kind=self.__class__.__name__, metadata=metadata + ) def add_question(self, question): - """ add_question: adds question to question list - Args: question to add to list - Returns: None + """add_question: adds question to question list + Args: question to add to list + Returns: None """ self.questions += [question] @@ -903,7 +1140,9 @@ def process_files(self): """Goes through question fields and replaces image strings Returns: content-hash based filenames of all the required image files """ - config.LOGGER.info("\t*** Processing images for exercise: {}".format(self.title)) + config.LOGGER.info( + "\t*** Processing images for exercise: {}".format(self.title) + ) downloaded = super(ExerciseNode, self).process_files() for question in self.questions: downloaded += question.process_question() @@ -914,11 +1153,11 @@ def process_files(self): return downloaded def process_exercise_data(self): - mastery_model = self.extra_fields['mastery_model'] + mastery_model = self.extra_fields["mastery_model"] # Keep original m/n values or other n/m values if specified - m_value = self.extra_fields.get('m') or self.extra_fields.get('n') - n_value = self.extra_fields.get('n') or self.extra_fields.get('m') + m_value = self.extra_fields.get("m") or self.extra_fields.get("n") + n_value = self.extra_fields.get("n") or self.extra_fields.get("m") if m_value: m_value = int(m_value) @@ -942,34 +1181,52 @@ def process_exercise_data(self): elif mastery_model == exercises.SKILL_CHECK: m_value = n_value = 1 - self.extra_fields.update({'m': m_value}) - self.extra_fields.update({'n': n_value}) + self.extra_fields.update({"m": m_value}) + self.extra_fields.update({"n": n_value}) def validate(self): - """ validate: Makes sure exercise is valid - Args: None - Returns: boolean indicating if exercise is valid + """validate: Makes sure exercise is valid + Args: None + Returns: boolean indicating if exercise is valid """ try: self.process_exercise_data() - assert self.kind == content_kinds.EXERCISE, "Assumption 
Failed: Node should be an exercise" + assert ( + self.kind == content_kinds.EXERCISE + ), "Assumption Failed: Node should be an exercise" # Check if questions are correct - assert any(self.questions), "Assumption Failed: Exercise does not have a question" - assert all([q.validate() for q in self.questions]), "Assumption Failed: Exercise has invalid question" - assert self.extra_fields['mastery_model'] in MASTERY_MODELS, \ - "Assumption Failed: Unrecognized mastery model {}".format(self.extra_fields['mastery_model']) - if self.extra_fields['mastery_model'] == exercises.M_OF_N: - assert 'm' in self.extra_fields and 'n' in self.extra_fields, "Assumption failed: M of N mastery model is missing M and/or N values" - assert isinstance(self.extra_fields['m'], int), "Assumption failed: M must be an integer value" - assert isinstance(self.extra_fields['m'], int), "Assumption failed: N must be an integer value" + assert any( + self.questions + ), "Assumption Failed: Exercise does not have a question" + assert all( + [q.validate() for q in self.questions] + ), "Assumption Failed: Exercise has invalid question" + assert ( + self.extra_fields["mastery_model"] in MASTERY_MODELS + ), "Assumption Failed: Unrecognized mastery model {}".format( + self.extra_fields["mastery_model"] + ) + if self.extra_fields["mastery_model"] == exercises.M_OF_N: + assert ( + "m" in self.extra_fields and "n" in self.extra_fields + ), "Assumption failed: M of N mastery model is missing M and/or N values" + assert isinstance( + self.extra_fields["m"], int + ), "Assumption failed: M must be an integer value" + assert isinstance( + self.extra_fields["n"], int + ), "Assumption failed: N must be an integer value" return super(ExerciseNode, self).validate() except (AssertionError, ValueError) as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) - + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) def truncate_fields(self): for q in self.questions: @@ -977,36 +1234,40 @@ def truncate_fields(self): super(ExerciseNode, self).truncate_fields() + class SlideshowNode(ContentNode): - """ Model representing Slideshows - - Slideshows are sequences of "Slides", a combination of an image and caption. - Slides are shown in a specified sequential order. - - Attributes: - source_id (str): content's original id - title (str): content's title - license (str or ): content's license - author (str): who created the content (optional) - aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) - provider (str): organization that commissioned or is distributing the content (optional) - description (str): description of content (optional) - files ([]): images associated with slides - thumbnail (str): local path or url to thumbnail image (optional) - extra_fields (dict): any additional data needed for node (optional) - domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) + """Model representing Slideshows + + Slideshows are sequences of "Slides", a combination of an image and caption. + Slides are shown in a specified sequential order.
+ + Attributes: + source_id (str): content's original id + title (str): content's title + license (str or ): content's license + author (str): who created the content (optional) + aggregator (str): website or org hosting the content collection but not necessarily the creator or copyright holder (optional) + provider (str): organization that commissioned or is distributing the content (optional) + description (str): description of content (optional) + files ([]): images associated with slides + thumbnail (str): local path or url to thumbnail image (optional) + extra_fields (dict): any additional data needed for node (optional) + domain_ns (str): who is providing the content (e.g. learningequality.org) (optional) """ + kind = content_kinds.SLIDESHOW def __init__(self, source_id, title, license, slideshow_data=None, **kwargs): if slideshow_data: - extra_fields = {'slideshow_data': slideshow_data} + extra_fields = {"slideshow_data": slideshow_data} else: - extra_fields = {'slideshow_data':[]} + extra_fields = {"slideshow_data": []} # THe Node base class' __init__ method has: # for f in files or []: # self.add_file(f) - super(SlideshowNode, self).__init__(source_id, title, license, extra_fields=extra_fields, **kwargs) + super(SlideshowNode, self).__init__( + source_id, title, license, extra_fields=extra_fields, **kwargs + ) def add_file(self, file_to_add): """ @@ -1016,35 +1277,42 @@ def add_file(self, file_to_add): Returns: None """ from .files import ThumbnailFile, SlideImageFile - assert isinstance(file_to_add, ThumbnailFile) or isinstance(file_to_add, SlideImageFile), "Files being added must be instances of a subclass of File class" + + assert isinstance(file_to_add, ThumbnailFile) or isinstance( + file_to_add, SlideImageFile + ), "Files being added must be instances of a subclass of File class" if file_to_add not in self.files: filename = file_to_add.get_filename() if filename: - checksum, ext = filename.split('.') # .[png|jpg|jpeg] + checksum, ext = filename.split(".") # .[png|jpg|jpeg] else: - raise ValueError('filename not available') + raise ValueError("filename not available") # # Appending to extra_fields is only necessary for SlideImageFile instances if isinstance(file_to_add, SlideImageFile): # # Find the idx of sort_order.next() - slideshow_image_files = [f for f in self.files if isinstance(f,SlideImageFile)] - idx = len(slideshow_image_files) # next available index, assuming added in desired order + slideshow_image_files = [ + f for f in self.files if isinstance(f, SlideImageFile) + ] + idx = len( + slideshow_image_files + ) # next available index, assuming added in desired order # # Add slideshow data to extra_fields['slideshow_data'] (aka manifest) - slideshow_data = self.extra_fields['slideshow_data'] + slideshow_data = self.extra_fields["slideshow_data"] slideshow_data.append( { - 'caption': file_to_add.caption, - 'descriptive_text': file_to_add.descriptive_text, - 'sort_order': idx, - 'checksum': checksum, - 'extension': ext + "caption": file_to_add.caption, + "descriptive_text": file_to_add.descriptive_text, + "sort_order": idx, + "checksum": checksum, + "extension": ext, } ) - self.extra_fields['slideshow_data'] = slideshow_data + self.extra_fields["slideshow_data"] = slideshow_data # # Add node->file link @@ -1053,13 +1321,23 @@ def add_file(self, file_to_add): def validate(self): from .files import SlideImageFile, ThumbnailFile + try: - assert [f for f in self.files if isinstance(f, SlideImageFile)], \ - "Assumption Failed: SlideshowNode must have at least one 
SlideImageFile file." - assert all([isinstance(f, SlideImageFile) or isinstance(f, ThumbnailFile) for f in self.files]), \ - "Assumption Failed: SlideshowNode files must be of type SlideImageFile or ThumbnailFile." + assert [ + f for f in self.files if isinstance(f, SlideImageFile) + ], "Assumption Failed: SlideshowNode must have at least one SlideImageFile file." + assert all( + [ + isinstance(f, SlideImageFile) or isinstance(f, ThumbnailFile) + for f in self.files + ] + ), "Assumption Failed: SlideshowNode files must be of type SlideImageFile or ThumbnailFile." except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) super(SlideshowNode, self).validate() @@ -1071,11 +1349,12 @@ def __init__(self, *args, **kwargs): kwargs["extra_fields"] = kwargs.get("extra_fields", {}) kwargs["extra_fields"]["options"] = kwargs["extra_fields"].get("options", {}) # TODO: update le-utils version and use a constant value here - kwargs["extra_fields"]["options"].update({'modality': "CUSTOM_NAVIGATION"}) + kwargs["extra_fields"]["options"].update({"modality": "CUSTOM_NAVIGATION"}) super(CustomNavigationNode, self).__init__(*args, **kwargs) def generate_thumbnail(self): from .files import HTMLZipFile, ExtractedHTMLZipThumbnailFile + html5_files = [f for f in self.files if isinstance(f, HTMLZipFile)] if html5_files: html_file = html5_files[0] @@ -1086,18 +1365,29 @@ def generate_thumbnail(self): return None def validate(self): - """ validate: Makes sure Custom Navigation app is valid - Args: None - Returns: boolean indicating if Custom Navigation app is valid + """validate: Makes sure Custom Navigation app is valid + Args: None + Returns: boolean indicating if Custom Navigation app is valid """ from .files import HTMLZipFile + try: - assert self.kind == content_kinds.TOPIC, "Assumption Failed: Node should be a Topic Node" - assert self.questions == [], "Assumption Failed: Custom Navigation should not have questions" - assert any(f for f in self.files if isinstance(f, HTMLZipFile)), "Assumption Failed: Custom Navigation should have at least one html file" + assert ( + self.kind == content_kinds.TOPIC + ), "Assumption Failed: Node should be a Topic Node" + assert ( + self.questions == [] + ), "Assumption Failed: Custom Navigation should not have questions" + assert any( + f for f in self.files if isinstance(f, HTMLZipFile) + ), "Assumption Failed: Custom Navigation should have at least one html file" return super(CustomNavigationNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class CustomNavigationChannelNode(ChannelNode): @@ -1107,21 +1397,30 @@ def __init__(self, *args, **kwargs): kwargs["extra_fields"] = kwargs.get("extra_fields", {}) kwargs["extra_fields"]["options"] = kwargs["extra_fields"].get("options", {}) # TODO: update le-utils version and use a constant value here - kwargs["extra_fields"]["options"].update({'modality': "CUSTOM_NAVIGATION"}) + kwargs["extra_fields"]["options"].update({"modality": "CUSTOM_NAVIGATION"}) super(CustomNavigationChannelNode, self).__init__(*args, **kwargs) def validate(self): - """ validate: Makes sure Custom Navigation app is valid - Args: None - Returns: boolean 
indicating if Custom Navigation app is valid + """validate: Makes sure Custom Navigation app is valid + Args: None + Returns: boolean indicating if Custom Navigation app is valid """ from .files import HTMLZipFile + try: - assert self.kind == "Channel", "Assumption Failed: Node should be a Topic Node" - assert any(f for f in self.files if isinstance(f, HTMLZipFile)), "Assumption Failed: Custom Navigation should have at least one html file" + assert ( + self.kind == "Channel" + ), "Assumption Failed: Node should be a Topic Node" + assert any( + f for f in self.files if isinstance(f, HTMLZipFile) + ), "Assumption Failed: Custom Navigation should have at least one html file" return super(CustomNavigationChannelNode, self).validate() except AssertionError as ae: - raise InvalidNodeException("Invalid node ({}): {} - {}".format(ae.args[0], self.title, self.__dict__)) + raise InvalidNodeException( + "Invalid node ({}): {} - {}".format( + ae.args[0], self.title, self.__dict__ + ) + ) class PracticeQuizNode(ExerciseNode): @@ -1129,9 +1428,10 @@ class PracticeQuizNode(ExerciseNode): Node class for creating Practice Quizzes that are exercises under the hood but are displayed as Practice Quizzes in Kolibri. """ + def __init__(self, *args, **kwargs): kwargs["exercise_data"] = kwargs.get("exercise_data", {}) kwargs["exercise_data"]["options"] = kwargs["exercise_data"].get("options", {}) # TODO: update le-utils version and use a constant value here - kwargs["exercise_data"]["options"].update({'modality': "QUIZ"}) + kwargs["exercise_data"]["options"].update({"modality": "QUIZ"}) super(PracticeQuizNode, self).__init__(*args, **kwargs) diff --git a/ricecooker/classes/questions.py b/ricecooker/classes/questions.py index 70b13532..2997d1d5 100644 --- a/ricecooker/classes/questions.py +++ b/ricecooker/classes/questions.py @@ -1,43 +1,66 @@ # Question models for exercises - -import uuid -import json +import copy import html +import json import re -import copy import sys -from bs4 import BeautifulSoup +import uuid from functools import partial -from le_utils.constants import content_kinds,file_formats, format_presets, licenses, exercises + +from bs4 import BeautifulSoup +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import file_formats +from le_utils.constants import format_presets +from le_utils.constants import licenses + from .. 
 import config -from ..exceptions import UnknownQuestionTypeError, InvalidQuestionException -from .files import _ExerciseImageFile, _ExerciseGraphieFile, _ExerciseBase64ImageFile +from ..exceptions import InvalidQuestionException +from ..exceptions import UnknownQuestionTypeError +from .files import _ExerciseBase64ImageFile +from .files import _ExerciseGraphieFile +from .files import _ExerciseImageFile from ricecooker.utils.encodings import get_base64_encoding -WEB_GRAPHIE_URL_REGEX = r'web\+graphie:(?P<rawpath>[^\)]+)' # match web_graphie:{{path}} -MARKDOWN_IMAGE_REGEX = r'!\[([^\]]+)?\]\(([^\)]+?)\)' # match ![{{smth}}]({{url}}) +WEB_GRAPHIE_URL_REGEX = ( + r"web\+graphie:(?P<rawpath>[^\)]+)" # match web_graphie:{{path}} +) +MARKDOWN_IMAGE_REGEX = r"!\[([^\]]+)?\]\(([^\)]+?)\)" # match ![{{smth}}]({{url}}) class BaseQuestion: - """ Base model representing exercise questions + """Base model representing exercise questions - Questions are used to assess learner's understanding + Questions are used to assess learner's understanding - Attributes: - id (str): question's unique id - question (str): question text - question_type (str): what kind of question is this - answers ([{'answer':str, 'correct':bool}]): answers to question - hints (str or [str]): optional hints on how to answer question - raw_data (str): raw data for perseus file + Attributes: + id (str): question's unique id + question (str): question text + question_type (str): what kind of question is this + answers ([{'answer':str, 'correct':bool}]): answers to question + hints (str or [str]): optional hints on how to answer question + raw_data (str): raw data for perseus file """ - def __init__(self, id, question, question_type, answers=None, hints=None, raw_data="", source_url=None, randomize=False): + + def __init__( + self, + id, + question, + question_type, + answers=None, + hints=None, + raw_data="", + source_url=None, + randomize=False, + ): self.question = question self.question_type = question_type self.files = [] self.answers = answers if answers is not None else [] - self.hints = [] if hints is None else [hints] if isinstance(hints,str) else hints + self.hints = ( + [] if hints is None else [hints] if isinstance(hints, str) else hints + ) self.raw_data = raw_data self.source_id = id self.source_url = source_url @@ -46,18 +69,22 @@ def __init__(self, id, question, question_type, answers=None, hints=None, raw_da def truncate_fields(self): if self.source_url and len(self.source_url) > config.MAX_SOURCE_URL_LENGTH: - config.print_truncate("question_source_url", self.source_id, self.source_url) - self.source_url = self.source_url[:config.MAX_SOURCE_URL_LENGTH] + config.print_truncate( + "question_source_url", self.source_id, self.source_url + ) + self.source_url = self.source_url[: config.MAX_SOURCE_URL_LENGTH] def to_dict(self): - """ to_dict: puts data in format CC expects - Args: None - Returns: dict of node's data + """to_dict: puts data in format CC expects + Args: None + Returns: dict of node's data """ return { "assessment_id": self.id.hex, "type": self.question_type, - "files": [f.to_dict() for f in filter(lambda x: x and x.filename, self.files)], + "files": [ + f.to_dict() for f in filter(lambda x: x and x.filename, self.files) + ], "question": self.question, "hints": json.dumps(self.hints, ensure_ascii=False), "answers": json.dumps(self.answers, ensure_ascii=False), @@ -67,18 +94,18 @@ def to_dict(self): } def create_answer(self, answer, correct=True): - """ create_answer: Put answer in standard format - Args: - answer (str): text of
answer - correct (bool): indicates if answer is correct - Returns: dict of formatted answer + """create_answer: Put answer in standard format + Args: + answer (str): text of answer + correct (bool): indicates if answer is correct + Returns: dict of formatted answer """ return {"answer": str(answer), "correct": correct} def process_question(self): - """ process_question: Parse data that needs to have image strings processed - Args: None - Returns: list of all downloaded files + """process_question: Parse data that needs to have image strings processed + Args: None + Returns: list of all downloaded files """ # Process question self.question, question_files = self.set_images(self.question) @@ -88,8 +115,14 @@ def process_question(self): answer_files = [] answer_index = 0 for answer in self.answers: - processed_string, afiles = self.set_images(answer['answer']) - answers.append({"answer": processed_string, "correct": answer['correct'], "order": answer_index}) + processed_string, afiles = self.set_images(answer["answer"]) + answers.append( + { + "answer": processed_string, + "correct": answer["correct"], + "order": answer_index, + } + ) answer_index += 1 answer_files += afiles self.answers = answers @@ -109,11 +142,11 @@ def process_question(self): return [f.filename for f in self.files] def set_images(self, text, parse_html=True): - """ set_images: Replace image strings with downloaded image checksums - Args: - text (str): text to parse for image strings - Returns:string with checksums in place of image strings and - list of files that were downloaded from string + """set_images: Replace image strings with downloaded image checksums + Args: + text (str): text to parse for image strings + Returns:string with checksums in place of image strings and + list of files that were downloaded from string """ # Set up return values and regex file_list = [] @@ -134,14 +167,14 @@ def set_images(self, text, parse_html=True): return processed_string, file_list def parse_html(self, text): - """ parse_html: Properly formats any img tags that might be in content - Args: - text (str): text to parse - Returns: string with properly formatted images + """parse_html: Properly formats any img tags that might be in content + Args: + text (str): text to parse + Returns: string with properly formatted images """ bs = BeautifulSoup(text, "html5lib") file_reg = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE) - tags = bs.findAll('img') + tags = bs.findAll("img") for tag in tags: # Look for src attribute, remove formatting if added to image @@ -151,7 +184,7 @@ def parse_html(self, text): alt_text = tag.get("alt") or "" tag.replaceWith("![{alt}]({src})".format(alt=alt_text, src=src_text)) - return html.unescape(bs.find('body').renderContents().decode('utf-8')) + return html.unescape(bs.find("body").renderContents().decode("utf-8")) def set_image(self, text): """ @@ -173,7 +206,7 @@ def set_image(self, text): graphie_match = graphie_regex.match(stripped_text) if graphie_match: is_web_plus_graphie = True - graphie_rawpath = graphie_match.groupdict()['rawpath'] + graphie_rawpath = graphie_match.groupdict()["rawpath"] graphie_path = graphie_rawpath.replace("//", "https://") exercise_image_file = _ExerciseGraphieFile(graphie_path) elif get_base64_encoding(stripped_text): @@ -187,25 +220,37 @@ def set_image(self, text): # Process file to make the replacement_str available _filename = exercise_image_file.process_file() # Get `new_text` = the replacement path for the image resource - new_text = 
exercises.CONTENT_STORAGE_FORMAT.format(exercise_image_file.get_replacement_str()) - if is_web_plus_graphie: # need to put back the `web+graphie:` prefix + new_text = exercises.CONTENT_STORAGE_FORMAT.format( + exercise_image_file.get_replacement_str() + ) + if is_web_plus_graphie: # need to put back the `web+graphie:` prefix new_text = "web+graphie:" + new_text return new_text, [exercise_image_file] def validate(self): - """ validate: Makes sure question is valid - Args: None - Returns: boolean indicating if question is valid + """validate: Makes sure question is valid + Args: None + Returns: boolean indicating if question is valid """ assert self.id is not None, "Assumption Failed: Question must have an id" - assert isinstance(self.question, str) or self.question is None, "Assumption Failed: Question must be a string" - assert isinstance(self.question_type, str), "Assumption Failed: Question type must be a string" - assert isinstance(self.answers, list), "Assumption Failed: Answers must be a list" + assert ( + isinstance(self.question, str) or self.question is None + ), "Assumption Failed: Question must be a string" + assert isinstance( + self.question_type, str + ), "Assumption Failed: Question type must be a string" + assert isinstance( + self.answers, list + ), "Assumption Failed: Answers must be a list" assert isinstance(self.hints, list), "Assumption Failed: Hints must be a list" for a in self.answers: - assert isinstance(a, dict), "Assumption Failed: Answer in answer list is not a dict" + assert isinstance( + a, dict + ), "Assumption Failed: Answer in answer list is not a dict" for h in self.hints: - assert isinstance(h, str), "Assumption Failed: Hint in hints list is not a string" + assert isinstance( + h, str + ), "Assumption Failed: Hint in hints list is not a string" return True @@ -224,21 +269,40 @@ class PerseusQuestion(BaseQuestion): def __init__(self, id, raw_data, source_url=None, **kwargs): raw_data = raw_data if isinstance(raw_data, str) else json.dumps(raw_data) - super(PerseusQuestion, self).__init__(id, "", exercises.PERSEUS_QUESTION, [], [], raw_data, source_url=source_url, **kwargs) + super(PerseusQuestion, self).__init__( + id, + "", + exercises.PERSEUS_QUESTION, + [], + [], + raw_data, + source_url=source_url, + **kwargs + ) def validate(self): - """ validate: Makes sure perseus question is valid - Args: None - Returns: boolean indicating if perseus question is valid + """validate: Makes sure perseus question is valid + Args: None + Returns: boolean indicating if perseus question is valid """ try: - assert self.question == "", "Assumption Failed: Perseus question should not have a question" - assert self.question_type == exercises.PERSEUS_QUESTION, "Assumption Failed: Question should be perseus type" - assert self.answers == [], "Assumption Failed: Answer list should be empty for perseus question" - assert self.hints == [], "Assumption Failed: Hints list should be empty for perseus question" + assert ( + self.question == "" + ), "Assumption Failed: Perseus question should not have a question" + assert ( + self.question_type == exercises.PERSEUS_QUESTION + ), "Assumption Failed: Question should be perseus type" + assert ( + self.answers == [] + ), "Assumption Failed: Answer list should be empty for perseus question" + assert ( + self.hints == [] + ), "Assumption Failed: Hints list should be empty for perseus question" return super(PerseusQuestion, self).validate() except AssertionError as ae: - raise InvalidQuestionException("Invalid question: 
{0}".format(self.__dict__)) + raise InvalidQuestionException( + "Invalid question: {0}".format(self.__dict__) + ) def process_question(self): """ @@ -253,22 +317,24 @@ def process_question(self): self._recursive_url_find(question_data, image_files) # Process question - if 'question' in question_data and 'images' in question_data['question']: - question_data['question']['images'], qfiles = self.process_image_field(question_data['question']) + if "question" in question_data and "images" in question_data["question"]: + question_data["question"]["images"], qfiles = self.process_image_field( + question_data["question"] + ) image_files += qfiles # Process hints - if 'hints' in question_data: - for hint in question_data['hints']: - if 'images' in hint: - hint['images'], hfiles = self.process_image_field(hint) + if "hints" in question_data: + for hint in question_data["hints"]: + if "images" in hint: + hint["images"], hfiles = self.process_image_field(hint) image_files += hfiles # Process answers - if 'answers' in question_data: - for answer in question_data['answers']: - if 'images' in answer: - answer['images'], afiles = self.process_image_field(answer) + if "answers" in question_data: + for answer in question_data["answers"]: + if "images" in answer: + answer["images"], afiles = self.process_image_field(answer) image_files += afiles # Process raw data @@ -276,7 +342,9 @@ def process_question(self): # Assume no need for special HTML processing for Persues questions # This avoids probelms with questions that contain < and > inequalities # in formulas that get erroneously parsed as HTML tags - self.raw_data, data_files = super(PerseusQuestion, self).set_images(self.raw_data, parse_html=False) + self.raw_data, data_files = super(PerseusQuestion, self).set_images( + self.raw_data, parse_html=False + ) # Combine all files processed self.files = image_files + data_files @@ -284,7 +352,6 @@ def process_question(self): # Return all filenames return [f.filename for f in self.files] - def process_image_field(self, data): """ Process perseus fields like questions and hints, which look like: @@ -308,14 +375,14 @@ def process_image_field(self, data): that are not listed under `images`, so code must handle that case too, see https://github.com/learningequality/ricecooker/issues/178 for details. """ - new_images_dict = copy.deepcopy(data['images']) + new_images_dict = copy.deepcopy(data["images"]) image_files = [] # STEP 1. Compile dict of {old_url-->new_url} image URL replacements image_replacements = {} # STEP 1A. get all images specified in data['images'] - for old_url, image_settings in data['images'].items(): + for old_url, image_settings in data["images"].items(): new_url, new_image_files = self.set_image(old_url) image_files += new_image_files new_images_dict[new_url] = new_images_dict.pop(old_url) @@ -323,7 +390,7 @@ def process_image_field(self, data): # STEP 1B. look for additional `MARKDOWN_IMAGE_REGEX`-like link in `content` attr. 
img_link_pat = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE) - img_link_matches = img_link_pat.findall(data['content']) + img_link_matches = img_link_pat.findall(data["content"]) for match in img_link_matches: old_url = match[1] if old_url not in image_replacements.keys(): @@ -333,11 +400,10 @@ def process_image_field(self, data): # Performd content replacent for all URLs in image_replacements for old_url, new_url in image_replacements.items(): - data['content'] = data['content'].replace(old_url, new_url) + data["content"] = data["content"].replace(old_url, new_url) return new_images_dict, image_files - def _recursive_url_find(self, item, image_list): """ Recursively traverses a dictionary-like data structure for Khan Academy @@ -358,9 +424,9 @@ def _recursive_url_find(self, item, image_list): list(map(recursive_fn, item)) elif isinstance(item, dict): - if 'url' in item: - if item['url']: - item['url'], image_file = self.set_image(item['url']) + if "url" in item: + if item["url"]: + item["url"], image_file = self.set_image(item["url"]) image_list += image_file for field, field_data in item.items(): @@ -371,131 +437,197 @@ def _recursive_url_find(self, item, image_list): class MultipleSelectQuestion(BaseQuestion): - """ Model representing multiple select questions - - Multiple select questions have a set of answers for - the learner to select. There can be multiple answers for - a question (e.g. Which of the following are prime numbers? - A. 1, B. 2, C. 3, D. 4) - - Attributes: - id (str): question's unique id - question (str): question text - correct_answers ([str]): list of correct answers - all_answers ([str]): list of all possible answers - hints ([str]): optional hints on how to answer question - images ({key:str, ...}): a dict mapping image placeholder names to path to image + """Model representing multiple select questions + + Multiple select questions have a set of answers for + the learner to select. There can be multiple answers for + a question (e.g. Which of the following are prime numbers? + A. 1, B. 2, C. 3, D. 
4) + + Attributes: + id (str): question's unique id + question (str): question text + correct_answers ([str]): list of correct answers + all_answers ([str]): list of all possible answers + hints ([str]): optional hints on how to answer question + images ({key:str, ...}): a dict mapping image placeholder names to path to image """ def __init__(self, id, question, correct_answers, all_answers, **kwargs): # Put answers into standard format set_all_answers = set(all_answers) - all_answers += [answer for answer in correct_answers if answer not in set_all_answers] - answers = [self.create_answer(answer, answer in correct_answers) for answer in all_answers] + all_answers += [ + answer for answer in correct_answers if answer not in set_all_answers + ] + answers = [ + self.create_answer(answer, answer in correct_answers) + for answer in all_answers + ] if len(answers) == 0: - answers = [self.create_answer('No answers provided.')] - config.LOGGER.warning("\tWARNING: Question {id} does not have any answers (set to default)".format(id=id)) - super(MultipleSelectQuestion, self).__init__(id, question, exercises.MULTIPLE_SELECTION, answers, **kwargs) + answers = [self.create_answer("No answers provided.")] + config.LOGGER.warning( + "\tWARNING: Question {id} does not have any answers (set to default)".format( + id=id + ) + ) + super(MultipleSelectQuestion, self).__init__( + id, question, exercises.MULTIPLE_SELECTION, answers, **kwargs + ) def validate(self): - """ validate: Makes sure multiple selection question is valid - Args: None - Returns: boolean indicating if multiple selection question is valid + """validate: Makes sure multiple selection question is valid + Args: None + Returns: boolean indicating if multiple selection question is valid """ try: - assert self.question_type == exercises.MULTIPLE_SELECTION, "Assumption Failed: Question should be multiple selection type" - assert len(self.answers) > 0, "Assumption Failed: Multiple selection question should have answers" + assert ( + self.question_type == exercises.MULTIPLE_SELECTION + ), "Assumption Failed: Question should be multiple selection type" + assert ( + len(self.answers) > 0 + ), "Assumption Failed: Multiple selection question should have answers" for a in self.answers: - assert 'answer' in a and isinstance(a['answer'], str), "Assumption Failed: Answer in answer list is not a string" - assert 'correct' in a and isinstance(a['correct'], bool), "Assumption Failed: Correct indicator is not a boolean in answer list" + assert "answer" in a and isinstance( + a["answer"], str + ), "Assumption Failed: Answer in answer list is not a string" + assert "correct" in a and isinstance( + a["correct"], bool + ), "Assumption Failed: Correct indicator is not a boolean in answer list" for h in self.hints: - assert isinstance(h, str), "Assumption Failed: Hint in hint list is not a string" + assert isinstance( + h, str + ), "Assumption Failed: Hint in hint list is not a string" return super(MultipleSelectQuestion, self).validate() except AssertionError as ae: - raise InvalidQuestionException("Invalid question: {0}".format(self.__dict__)) + raise InvalidQuestionException( + "Invalid question: {0}".format(self.__dict__) + ) class SingleSelectQuestion(BaseQuestion): - """ Model representing single select questions - - Single select questions have a set of answers for - with only one correct answer. (e.g. How many degrees are in a right angle? - A. 45, B. 90, C. 180, D. 
None of the above) - - Attributes: - id (str): question's unique id - question (str): question text - correct_answer (str): correct answer - all_answers ([str]): list of all possible answers - hints ([str]): optional hints on how to answer question + """Model representing single select questions + + Single select questions have a set of answers for + with only one correct answer. (e.g. How many degrees are in a right angle? + A. 45, B. 90, C. 180, D. None of the above) + + Attributes: + id (str): question's unique id + question (str): question text + correct_answer (str): correct answer + all_answers ([str]): list of all possible answers + hints ([str]): optional hints on how to answer question """ + def __init__(self, id, question, correct_answer, all_answers, **kwargs): # Put answers into standard format if correct_answer not in all_answers: all_answers += [correct_answer] - answers = [self.create_answer(answer, answer==correct_answer) for answer in all_answers] + answers = [ + self.create_answer(answer, answer == correct_answer) + for answer in all_answers + ] if len(answers) == 0: - answers = [self.create_answer('No answers provided.')] - config.LOGGER.warning("\tWARNING: Question {id} does not have any answers (set to default)".format(id=id)) - super(SingleSelectQuestion, self).__init__(id, question, exercises.SINGLE_SELECTION, answers, **kwargs) + answers = [self.create_answer("No answers provided.")] + config.LOGGER.warning( + "\tWARNING: Question {id} does not have any answers (set to default)".format( + id=id + ) + ) + super(SingleSelectQuestion, self).__init__( + id, question, exercises.SINGLE_SELECTION, answers, **kwargs + ) def validate(self): - """ validate: Makes sure single selection question is valid - Args: None - Returns: boolean indicating if single selection question is valid + """validate: Makes sure single selection question is valid + Args: None + Returns: boolean indicating if single selection question is valid """ try: - assert self.question_type == exercises.SINGLE_SELECTION, "Assumption Failed: Question should be single selection type" - assert len(self.answers) > 0, "Assumption Failed: Multiple selection question should have answers" + assert ( + self.question_type == exercises.SINGLE_SELECTION + ), "Assumption Failed: Question should be single selection type" + assert ( + len(self.answers) > 0 + ), "Assumption Failed: Multiple selection question should have answers" correct_answers = 0 for a in self.answers: - assert 'answer' in a and isinstance(a['answer'], str), "Assumption Failed: Answer in answer list is not a string" - assert 'correct' in a and isinstance(a['correct'], bool), "Assumption Failed: Correct indicator is not a boolean in answer list" - correct_answers += 1 if a['correct'] else 0 - assert correct_answers == 1, "Assumption Failed: Single selection question should have only one correct answer" + assert "answer" in a and isinstance( + a["answer"], str + ), "Assumption Failed: Answer in answer list is not a string" + assert "correct" in a and isinstance( + a["correct"], bool + ), "Assumption Failed: Correct indicator is not a boolean in answer list" + correct_answers += 1 if a["correct"] else 0 + assert ( + correct_answers == 1 + ), "Assumption Failed: Single selection question should have only one correct answer" for h in self.hints: - assert isinstance(h, str), "Assumption Failed: Hint in hints list is not a string" + assert isinstance( + h, str + ), "Assumption Failed: Hint in hints list is not a string" return super(SingleSelectQuestion, 
self).validate() except AssertionError as ae: - raise InvalidQuestionException("Invalid question: {0}".format(self.__dict__)) + raise InvalidQuestionException( + "Invalid question: {0}".format(self.__dict__) + ) class InputQuestion(BaseQuestion): - """ Model representing input questions + """Model representing input questions - Input questions are questions that have one or more - answers (e.g. Name a factor of 10. ____) + Input questions are questions that have one or more + answers (e.g. Name a factor of 10. ____) - Attributes: - id (str): question's unique id - question (str): question text - answers ([{'answer':str, 'hint':str}]): answers to question - hints ([str]): optional hints on how to answer question - images ({key:str, ...}): a dict mapping image placeholder names to path to image + Attributes: + id (str): question's unique id + question (str): question text + answers ([{'answer':str, 'hint':str}]): answers to question + hints ([str]): optional hints on how to answer question + images ({key:str, ...}): a dict mapping image placeholder names to path to image """ + def __init__(self, id, question, answers, **kwargs): answers = [self.create_answer(answer) for answer in answers] if len(answers) == 0: - answers = [self.create_answer('No answers provided.')] - config.LOGGER.warning("\tWARNING: Question {id} does not have any answers (set to default)".format(id=id)) - super(InputQuestion, self).__init__(id, question, exercises.INPUT_QUESTION, answers, **kwargs) + answers = [self.create_answer("No answers provided.")] + config.LOGGER.warning( + "\tWARNING: Question {id} does not have any answers (set to default)".format( + id=id + ) + ) + super(InputQuestion, self).__init__( + id, question, exercises.INPUT_QUESTION, answers, **kwargs + ) def validate(self): - """ validate: Makes sure input question is valid - Args: None - Returns: boolean indicating if input question is valid + """validate: Makes sure input question is valid + Args: None + Returns: boolean indicating if input question is valid """ try: - assert self.question_type == exercises.INPUT_QUESTION, "Assumption Failed: Question should be input answer type" - assert len(self.answers) > 0, "Assumption Failed: Multiple selection question should have answers" + assert ( + self.question_type == exercises.INPUT_QUESTION + ), "Assumption Failed: Question should be input answer type" + assert ( + len(self.answers) > 0 + ), "Assumption Failed: Multiple selection question should have answers" for a in self.answers: - assert 'answer' in a, "Assumption Failed: Answers must have an answer field" + assert ( + "answer" in a + ), "Assumption Failed: Answers must have an answer field" try: - float(a['answer']) + float(a["answer"]) except ValueError: - assert False, "Assumption Failed: Answer {} must be numeric".format(a['answer']) + assert False, "Assumption Failed: Answer {} must be numeric".format( + a["answer"] + ) for h in self.hints: - assert isinstance(h, str), "Assumption Failed: Hint in hints list is not a string" + assert isinstance( + h, str + ), "Assumption Failed: Hint in hints list is not a string" return super(InputQuestion, self).validate() except AssertionError as ae: - raise InvalidQuestionException("Invalid question: {0}".format(self.__dict__)) + raise InvalidQuestionException( + "Invalid question: {0}".format(self.__dict__) + ) diff --git a/ricecooker/cli.py b/ricecooker/cli.py index 01f8623e..b4b46dca 100644 --- a/ricecooker/cli.py +++ b/ricecooker/cli.py @@ -5,11 +5,11 @@ import sys import uuid -CONFIG_DIR = 
os.path.join(os.path.expanduser('~'), '.ricecooker')
-CONFIG_FILE = os.path.join(CONFIG_DIR, 'config.yaml')
+CONFIG_DIR = os.path.join(os.path.expanduser("~"), ".ricecooker")
+CONFIG_FILE = os.path.join(CONFIG_DIR, "config.yaml")

 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-TEMPLATE_DIR = os.path.join(SCRIPT_DIR, 'templates')
+TEMPLATE_DIR = os.path.join(SCRIPT_DIR, "templates")

 import ricecooker
 import ricecooker.config as config
@@ -17,7 +17,7 @@
 from jinja2 import Template
 import yaml

-props = ['token', 'tempdir']
+props = ["token", "tempdir"]
 jiro_config = {}
 if os.path.exists(CONFIG_FILE):
     jiro_config = yaml.full_load(open(CONFIG_FILE))
@@ -26,22 +26,26 @@ def save_config():
     global jiro_config

     os.makedirs(CONFIG_DIR, exist_ok=True)
-    yaml.dump(jiro_config, open(CONFIG_FILE, 'w'))
+    yaml.dump(jiro_config, open(CONFIG_FILE, "w"))


 def get_chef_script():
-    if os.path.exists('chef.py'):
-        return 'chef.py'
+    if os.path.exists("chef.py"):
+        return "chef.py"

-    return 'sushichef.py'
+    return "sushichef.py"


 def prompt_for_token(remote_name):
     global jiro_config
     print("remote_name = {}".format(remote_name))
-    remote = jiro_config['remotes'][remote_name]
-    result = input("Please enter an authentication token for server {} ({}): ".format(remote_name, remote['url']))
-    remote['token'] = result
+    remote = jiro_config["remotes"][remote_name]
+    result = input(
+        "Please enter an authentication token for server {} ({}): ".format(
+            remote_name, remote["url"]
+        )
+    )
+    remote["token"] = result
     save_config()

     return result
@@ -49,39 +53,37 @@ def prompt_for_token(remote_name):
 def run_ricecooker(cmd, remote_name=None, extra_args=None):
     """
     Runs ricecooker. Should be run from a directory containing sushichef.py or chef.py.

     :param args: Object with passed parameters and default values.
     :return:
     """
     global jiro_config

-    cmd_args = [
-        sys.executable,
-        get_chef_script(),
-        cmd
-    ]
+    cmd_args = [sys.executable, get_chef_script(), cmd]
     cmd_args.extend(extra_args)

     env = os.environ.copy()
-    if 'tempdir' in jiro_config:
-        env['TMPDIR'] = jiro_config['tempdir'].format(CHEFDIR=os.getcwd())
-        print("TMPDIR = {}".format(env['TMPDIR']))
+    if "tempdir" in jiro_config:
+        env["TMPDIR"] = jiro_config["tempdir"].format(CHEFDIR=os.getcwd())
+        print("TMPDIR = {}".format(env["TMPDIR"]))

     token = None
     if remote_name:
-        if remote_name in jiro_config['remotes']:
-            remote = jiro_config['remotes'][remote_name]
-            env['STUDIO_URL'] = remote['url']
-            if 'token' in remote and remote['token']:
-                token = remote['token']
+        if remote_name in jiro_config["remotes"]:
+            remote = jiro_config["remotes"][remote_name]
+            env["STUDIO_URL"] = remote["url"]
+            if "token" in remote and remote["token"]:
+                token = remote["token"]
             else:
                 token = prompt_for_token(remote_name)
-            cmd_args.extend(['--token', token])
+            cmd_args.extend(["--token", token])
         else:
-            print("ERROR: No remote with name {} found. To see available remotes, run: jiro remote list")
+            print(
+                "ERROR: No remote with name {} found. To see available remotes, run: jiro remote list".format(remote_name)
+            )
             sys.exit(1)

     print("Running {}".format(cmd_args))
@@ -90,8 +92,8 @@ def run_ricecooker(cmd, remote_name=None, extra_args=None):

 def add_default_remote():
     global jiro_config
-    if not 'remotes' in jiro_config:
-        jiro_config['remotes'] = {'default': {'url': config.DEFAULT_DOMAIN}}
+    if not "remotes" in jiro_config:
+        jiro_config["remotes"] = {"default": {"url": config.DEFAULT_DOMAIN}}
         # Note: we won't prompt for token until trying to upload to remote.
     save_config()

@@ -105,10 +107,7 @@ def add_remote(args, remainder):
     """
     global jiro_config

-    jiro_config['remotes'][args.name] = {
-        'url': args.url,
-        'token': args.token
-    }
+    jiro_config["remotes"][args.name] = {"url": args.url, "token": args.token}
     save_config()


@@ -120,8 +119,8 @@ def list_remotes(args, remainder):
     """
     global jiro_config

-    for remote in jiro_config['remotes']:
-        print("{}: {}".format(remote, jiro_config['remotes'][remote]['url']))
+    for remote in jiro_config["remotes"]:
+        print("{}: {}".format(remote, jiro_config["remotes"][remote]["url"]))


 def set(args, remainder):
@@ -141,12 +140,12 @@ def new_chef(args, remainder):
     repo_name = "sushi-chef-{}".format(name.lower().replace(" ", "-"))

     # strip out non-alphanumeric charactesr from class name
-    chef_name = re.sub('[^A-Za-z0-9]+', '', name.title())
+    chef_name = re.sub("[^A-Za-z0-9]+", "", name.title())

     arg_dict = {
-        'channel_id': uuid.uuid4().hex,
-        'channel_name': name,
-        'chef_name': chef_name
+        "channel_id": uuid.uuid4().hex,
+        "channel_name": name,
+        "chef_name": chef_name,
     }

     cwd = os.getcwd()
@@ -159,44 +158,48 @@ def new_chef(args, remainder):
     os.makedirs(repo_dir, exist_ok=True)
     os.chdir(repo_dir)

-    assets_dir = os.path.join(repo_dir, 'assets')
+    assets_dir = os.path.join(repo_dir, "assets")
     os.makedirs(assets_dir, exist_ok=True)

-    chef_filename = os.path.join(repo_dir, 'sushichef.py')
+    chef_filename = os.path.join(repo_dir, "sushichef.py")
     if not os.path.exists(chef_filename):
-        template = Template(open(os.path.join(TEMPLATE_DIR, 'sushichef.py')).read())
+        template = Template(
+            open(os.path.join(TEMPLATE_DIR, "sushichef.py.template")).read()
+        )
         output = template.render(**arg_dict)
-        f = open(chef_filename, 'w')
+        f = open(chef_filename, "w")
         f.write(output)
         f.close()

-    reqs_filename = os.path.join(repo_dir, 'requirements.txt')
+    reqs_filename = os.path.join(repo_dir, "requirements.txt")
     if not os.path.exists(reqs_filename):
-        f = open(reqs_filename, 'w')
+        f = open(reqs_filename, "w")
         f.write("ricecooker>={}".format(ricecooker.__version__))
         f.close()


 def setup_env(args, remainder):
     cwd = os.getcwd()

-    venv_dir = os.path.join(cwd, '.venv')
+    venv_dir = os.path.join(cwd, ".venv")
     if not os.path.exists(venv_dir):
-        subprocess.call(['virtualenv', venv_dir])
+        subprocess.call(["virtualenv", venv_dir])

-    requirements = os.path.join(cwd, 'requirements.txt')
-    assert os.path.exists(requirements), "No requirements.txt file found, cannot set up Python environment."
-    subprocess.call(['pip', 'install', '-r', 'requirements.txt'])
+    requirements = os.path.join(cwd, "requirements.txt")
+    assert os.path.exists(
+        requirements
+    ), "No requirements.txt file found, cannot set up Python environment."
+ subprocess.call(["pip", "install", "-r", "requirements.txt"]) def fetch(args, remainder): - return run_ricecooker('fetch', extra_args=remainder) + return run_ricecooker("fetch", extra_args=remainder) def prepare(args, remainder): - return run_ricecooker('dryrun', extra_args=remainder) + return run_ricecooker("dryrun", extra_args=remainder) def serve(args, remainder): - return run_ricecooker('uploadchannel', args.destination, extra_args=remainder) + return run_ricecooker("uploadchannel", args.destination, extra_args=remainder) def main(): @@ -205,40 +208,53 @@ def main(): parser = argparse.ArgumentParser() - commands = parser.add_subparsers(title='commands', help='Commands to operate on ricecooker projects') + commands = parser.add_subparsers( + title="commands", help="Commands to operate on ricecooker projects" + ) - set_cmd = commands.add_parser('set') - set_cmd.add_argument('name', nargs='?', help='Property to set. Choices are: %r' % (props,)) - set_cmd.add_argument('value', nargs='?', help='Value as string to set property to.') + set_cmd = commands.add_parser("set") + set_cmd.add_argument( + "name", nargs="?", help="Property to set. Choices are: %r" % (props,) + ) + set_cmd.add_argument("value", nargs="?", help="Value as string to set property to.") set_cmd.set_defaults(func=set) - remote_cmd = commands.add_parser('remote') - remote_cmds = remote_cmd.add_subparsers(title='remotes', description='Commands related to remote server management.') - - add_cmd = remote_cmds.add_parser('add') - add_cmd.add_argument('name', nargs='?', help='Name of upload server.') - add_cmd.add_argument('url', nargs='?', help='URL of server to upload to.') - add_cmd.add_argument('token', nargs='?', help='User authentication token for server.') + remote_cmd = commands.add_parser("remote") + remote_cmds = remote_cmd.add_subparsers( + title="remotes", description="Commands related to remote server management." + ) + + add_cmd = remote_cmds.add_parser("add") + add_cmd.add_argument("name", nargs="?", help="Name of upload server.") + add_cmd.add_argument("url", nargs="?", help="URL of server to upload to.") + add_cmd.add_argument( + "token", nargs="?", help="User authentication token for server." + ) add_cmd.set_defaults(func=add_remote) - list_cmd = remote_cmds.add_parser('list') + list_cmd = remote_cmds.add_parser("list") list_cmd.set_defaults(func=list_remotes) - setup_cmd = commands.add_parser('setup') + setup_cmd = commands.add_parser("setup") setup_cmd.set_defaults(func=setup_env) - new_cmd = commands.add_parser('new') - new_cmd.add_argument('name', nargs='?', help='Name of new chef') + new_cmd = commands.add_parser("new") + new_cmd.add_argument("name", nargs="?", help="Name of new chef") new_cmd.set_defaults(func=new_chef) - fetch_cmd = commands.add_parser('fetch') + fetch_cmd = commands.add_parser("fetch") fetch_cmd.set_defaults(func=fetch) - prepare_cmd = commands.add_parser('prepare') + prepare_cmd = commands.add_parser("prepare") prepare_cmd.set_defaults(func=prepare) - serve_cmd = commands.add_parser('serve') - serve_cmd.add_argument('destination', nargs='?', default="default", help='Name of remote server to upload to.') + serve_cmd = commands.add_parser("serve") + serve_cmd.add_argument( + "destination", + nargs="?", + default="default", + help="Name of remote server to upload to.", + ) serve_cmd.set_defaults(func=serve) # just pass down the remaining args to the command. 
diff --git a/ricecooker/commands.py b/ricecooker/commands.py index c75be3dc..2424ccae 100644 --- a/ricecooker/commands.py +++ b/ricecooker/commands.py @@ -1,16 +1,18 @@ +import csv import json +import os import random -import requests -from requests.exceptions import HTTPError import sys import webbrowser -import os -import csv +import requests +from requests.exceptions import HTTPError -from . import config, __version__ +from . import __version__ +from . import config from .classes.nodes import ChannelNode -from .managers.progress import RestoreManager, Status +from .managers.progress import RestoreManager +from .managers.progress import Status from .managers.tree import ChannelManager # Fix to support Python 2.x. @@ -33,76 +35,98 @@ def uploadchannel_wrapper(chef, args, options): uploadchannel(chef, **args_and_options) -def uploadchannel(chef, command='uploadchannel', update=False, thumbnails=False, download_attempts=3, resume=False, step=Status.LAST.name, token="#", prompt=False, publish=False, compress=False, stage=False, **kwargs): - """ uploadchannel: Upload channel to Kolibri Studio - Args: - chef (SushiChef subclass): class that implements the construct_channel method - command (str): the action we want to perform in this run - update (bool): indicates whether to re-download files (optional) - thumbnails (bool): indicates whether to automatically derive thumbnails from content (optional) - download_attempts (int): number of times to retry downloading files (optional) - resume (bool): indicates whether to resume last session automatically (optional) - step (str): step to resume process from (optional) - token (str): content server authorization token - prompt (bool): indicates whether to prompt user to open channel when done (optional) - publish (bool): indicates whether to automatically publish channel (optional) - compress (bool): indicates whether to compress larger files (optional) - stage (bool): indicates whether to stage rather than deploy channel (optional) - kwargs (dict): extra keyword args will be passed to construct_channel (optional) - Returns: (str) link to access newly created channel +def uploadchannel( + chef, + command="uploadchannel", + update=False, + thumbnails=False, + download_attempts=3, + resume=False, + step=Status.LAST.name, + token="#", + prompt=False, + publish=False, + compress=False, + stage=False, + **kwargs +): + """uploadchannel: Upload channel to Kolibri Studio + Args: + chef (SushiChef subclass): class that implements the construct_channel method + command (str): the action we want to perform in this run + update (bool): indicates whether to re-download files (optional) + thumbnails (bool): indicates whether to automatically derive thumbnails from content (optional) + download_attempts (int): number of times to retry downloading files (optional) + resume (bool): indicates whether to resume last session automatically (optional) + step (str): step to resume process from (optional) + token (str): content server authorization token + prompt (bool): indicates whether to prompt user to open channel when done (optional) + publish (bool): indicates whether to automatically publish channel (optional) + compress (bool): indicates whether to compress larger files (optional) + stage (bool): indicates whether to stage rather than deploy channel (optional) + kwargs (dict): extra keyword args will be passed to construct_channel (optional) + Returns: (str) link to access newly created channel """ # Set configuration settings config.UPDATE = update - 
config.COMPRESS = chef.get_setting('compress-videos', False) - config.THUMBNAILS = chef.get_setting('generate-missing-thumbnails', False) + config.COMPRESS = chef.get_setting("compress-videos", False) + config.THUMBNAILS = chef.get_setting("generate-missing-thumbnails", False) config.STAGE = stage config.PUBLISH = publish # Set max retries for downloading - config.DOWNLOAD_SESSION.mount('http://', requests.adapters.HTTPAdapter(max_retries=int(download_attempts))) - config.DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=int(download_attempts))) + config.DOWNLOAD_SESSION.mount( + "http://", requests.adapters.HTTPAdapter(max_retries=int(download_attempts)) + ) + config.DOWNLOAD_SESSION.mount( + "https://", requests.adapters.HTTPAdapter(max_retries=int(download_attempts)) + ) # Get domain to upload to config.init_file_mapping_store() - - if not command == 'dryrun': + if not command == "dryrun": # Authenticate user and check current Ricecooker version username, token = authenticate_user(token) config.LOGGER.info("Logged in with username {0}".format(username)) check_version_number() else: - username = '' - token = '' + username = "" + token = "" config.LOGGER.info("\n\n***** Starting channel build process *****\n\n") # Set up progress tracker config.PROGRESS_MANAGER = RestoreManager() - if (not resume or not config.PROGRESS_MANAGER.check_for_session()) and step.upper() != Status.DONE.name: + if ( + not resume or not config.PROGRESS_MANAGER.check_for_session() + ) and step.upper() != Status.DONE.name: config.PROGRESS_MANAGER.init_session() else: - if resume or prompt_yes_or_no('Previous session detected. Would you like to resume your last session?'): + if resume or prompt_yes_or_no( + "Previous session detected. Would you like to resume your last session?" + ): config.LOGGER.info("Resuming your last session...") step = Status.LAST.name if step is None else step - config.PROGRESS_MANAGER = config.PROGRESS_MANAGER.load_progress(step.upper()) + config.PROGRESS_MANAGER = config.PROGRESS_MANAGER.load_progress( + step.upper() + ) else: config.PROGRESS_MANAGER.init_session() - if hasattr(chef, 'download_content'): + if hasattr(chef, "download_content"): chef.download_content() # TODO load csv if exists metadata_dict = chef.load_channel_metadata_from_csv() - # Construct channel if it hasn't been constructed already if config.PROGRESS_MANAGER.get_status_val() <= Status.CONSTRUCT_CHANNEL.value: config.LOGGER.info("Calling construct_channel... 
") channel = chef.construct_channel(**kwargs) - if 'sample' in kwargs and kwargs['sample']: - channel = select_sample_nodes(channel, size=kwargs['sample']) + if "sample" in kwargs and kwargs["sample"]: + channel = select_sample_nodes(channel, size=kwargs["sample"]) config.PROGRESS_MANAGER.set_channel(channel) channel = config.PROGRESS_MANAGER.channel @@ -124,8 +148,8 @@ def uploadchannel(chef, command='uploadchannel', update=False, thumbnails=False, chef.save_channel_metadata_as_csv(channel) - if command == 'dryrun': - config.LOGGER.info('Command is dryrun so we are not uploading chanel.') + if command == "dryrun": + config.LOGGER.info("Command is dryrun so we are not uploading chanel.") return # Set download manager in case steps were skipped @@ -157,7 +181,10 @@ def uploadchannel(chef, command='uploadchannel', update=False, thumbnails=False, channel_id = config.PROGRESS_MANAGER.channel_id # Publish tree if flag is set to True - if config.PUBLISH and config.PROGRESS_MANAGER.get_status_val() <= Status.PUBLISH_CHANNEL.value: + if ( + config.PUBLISH + and config.PROGRESS_MANAGER.get_status_val() <= Status.PUBLISH_CHANNEL.value + ): config.LOGGER.info("") config.LOGGER.info("Publishing channel...") publish_tree(tree, channel_id) @@ -165,13 +192,14 @@ def uploadchannel(chef, command='uploadchannel', update=False, thumbnails=False, # Open link on web browser (if specified) and return new link config.LOGGER.info("\n\nDONE: Channel created at {0}\n".format(channel_link)) - if prompt and prompt_yes_or_no('Would you like to open your channel now?'): + if prompt and prompt_yes_or_no("Would you like to open your channel now?"): config.LOGGER.info("Opening channel... ") webbrowser.open_new_tab(channel_link) config.PROGRESS_MANAGER.set_done() return channel_link + def authenticate_user(token): """ This function adds the studio Authorization `token` header to `config.SESSION` @@ -187,32 +215,36 @@ def authenticate_user(token): response = config.SESSION.post(auth_endpoint) response.raise_for_status() user = json.loads(response._content.decode("utf-8")) - return user['username'], token + return user["username"], token except HTTPError: config.LOGGER.error("Studio token rejected by server " + auth_endpoint) sys.exit() + def check_version_number(): - response = config.SESSION.post(config.check_version_url(), data=json.dumps({"version": __version__})) + response = config.SESSION.post( + config.check_version_url(), data=json.dumps({"version": __version__}) + ) response.raise_for_status() - result = json.loads(response._content.decode('utf-8')) - - if result['status'] == 0: - config.LOGGER.info(result['message']) - elif result['status'] == 1: - config.LOGGER.warning(result['message']) - elif result['status'] == 2: - config.LOGGER.error(result['message']) + result = json.loads(response._content.decode("utf-8")) + + if result["status"] == 0: + config.LOGGER.info(result["message"]) + elif result["status"] == 1: + config.LOGGER.warning(result["message"]) + elif result["status"] == 2: + config.LOGGER.error(result["message"]) if not prompt_yes_or_no("Continue anyways?"): sys.exit() else: - config.LOGGER.error(result['message']) + config.LOGGER.error(result["message"]) sys.exit() + def prompt_yes_or_no(message): - """ prompt_yes_or_no: Prompt user to reply with a y/n response - Args: None - Returns: None + """prompt_yes_or_no: Prompt user to reply with a y/n response + Args: None + Returns: None """ user_input = input("{} [y/n]:".format(message)).lower() if user_input.startswith("y"): @@ -224,10 +256,10 @@ def 
prompt_yes_or_no(message): def create_initial_tree(channel): - """ create_initial_tree: Create initial tree structure - Args: - channel (Channel): channel to construct - Returns: tree manager to run rest of steps + """create_initial_tree: Create initial tree structure + Args: + channel (Channel): channel to construct + Returns: tree manager to run rest of steps """ # Create channel manager with channel data config.LOGGER.info(" Setting up initial channel structure... ") @@ -240,11 +272,12 @@ def create_initial_tree(channel): config.LOGGER.info(" Tree is valid") return tree + def process_tree_files(tree): - """ process_tree_files: Download files from nodes - Args: - tree (ChannelManager): manager to handle communication to Kolibri Studio - Returns: None + """process_tree_files: Download files from nodes + Args: + tree (ChannelManager): manager to handle communication to Kolibri Studio + Returns: None """ # Fill in values necessary for next steps config.LOGGER.info("Processing content...") @@ -252,36 +285,40 @@ def process_tree_files(tree): tree.check_for_files_failed() return files_to_diff, config.FAILED_FILES + def get_file_diff(tree, files_to_diff): - """ get_file_diff: Download files from nodes - Args: - tree (ChannelManager): manager to handle communication to Kolibri Studio - Returns: list of files that are not on Kolibri Studio + """get_file_diff: Download files from nodes + Args: + tree (ChannelManager): manager to handle communication to Kolibri Studio + Returns: list of files that are not on Kolibri Studio """ # Determine which files have not yet been uploaded to the CC server config.LOGGER.info(" Checking if files exist on Kolibri Studio...") file_diff = tree.get_file_diff(files_to_diff) return file_diff + def upload_files(tree, file_diff): - """ upload_files: Upload files to Kolibri Studio - Args: - tree (ChannelManager): manager to handle communication to Kolibri Studio - file_diff ([str]): list of files to upload - Returns: None + """upload_files: Upload files to Kolibri Studio + Args: + tree (ChannelManager): manager to handle communication to Kolibri Studio + file_diff ([str]): list of files to upload + Returns: None """ # Upload new files to CC - config.LOGGER.info(" Uploading {0} new file(s) to Kolibri Studio...".format(len(file_diff))) + config.LOGGER.info( + " Uploading {0} new file(s) to Kolibri Studio...".format(len(file_diff)) + ) tree.upload_files(file_diff) tree.reattempt_upload_fails() return file_diff def create_tree(tree): - """ create_tree: Upload tree to Kolibri Studio - Args: - tree (ChannelManager): manager to handle communication to Kolibri Studio - Returns: channel id of created channel and link to channel + """create_tree: Upload tree to Kolibri Studio + Args: + tree (ChannelManager): manager to handle communication to Kolibri Studio + Returns: channel id of created channel and link to channel """ # Create tree config.LOGGER.info("Creating tree on Kolibri Studio...") @@ -290,11 +327,11 @@ def create_tree(tree): def publish_tree(tree, channel_id): - """ publish_tree: Publish tree to Kolibri - Args: - tree (ChannelManager): manager to handle communication to Kolibri Studio - channel_id (str): id of channel to publish - Returns: None + """publish_tree: Publish tree to Kolibri + Args: + tree (ChannelManager): manager to handle communication to Kolibri Studio + channel_id (str): id of channel to publish + Returns: None """ config.LOGGER.info("Publishing tree to Kolibri... 
") tree.publish(channel_id) @@ -305,10 +342,11 @@ def select_sample_nodes(channel, size=10, seed=42): Build a sample tree of `size` leaf nodes from the channel `channel` to use for debugging chef functionality without uploading the whole tree. """ - config.LOGGER.info('Selecting a sample of size ' + str(size)) + config.LOGGER.info("Selecting a sample of size " + str(size)) # Step 1. channel to paths - node_paths = [] # list of tuples of the form (topic1, topic2, leafnode) + node_paths = [] # list of tuples of the form (topic1, topic2, leafnode) + def walk_tree(parents_path, subtree): for child in subtree.children: child_path = parents_path + (child,) @@ -318,6 +356,7 @@ def walk_tree(parents_path, subtree): else: # emit leaf node node_paths.append(child_path) + walk_tree((), channel) # Step 2. sample paths @@ -331,12 +370,13 @@ def walk_tree(parents_path, subtree): # Step 3. paths to channel_sample channel_sample = ChannelNode( source_domain=channel.source_domain, - source_id=channel.source_id+'-sample', - title='Sample from ' + channel.title, + source_id=channel.source_id + "-sample", + title="Sample from " + channel.title, thumbnail=channel.thumbnail, language=channel.language, - description='Sample from ' + channel.description + description="Sample from " + channel.description, ) + def attach(parent, node_path): if len(node_path) == 1: # leaf node @@ -346,6 +386,7 @@ def attach(parent, node_path): if not any(c.source_id == child.source_id for c in parent.children): parent.add_child(child) attach(child, node_path[1:]) + for node_path in sample_paths: attach(channel_sample, node_path) diff --git a/ricecooker/config.py b/ricecooker/config.py index c2594d41..2f139730 100644 --- a/ricecooker/config.py +++ b/ricecooker/config.py @@ -5,12 +5,12 @@ import hashlib import logging.config import os -import requests -from requests_file import FileAdapter import shutil import socket import tempfile +import requests +from requests_file import FileAdapter UPDATE = False @@ -98,12 +98,12 @@ def setup_logging(level=logging.INFO, main_log=None, error_log=None, add_loggers } config = { - 'version': 1, - 'disable_existing_loggers': False, - 'formatters': { - 'colored': { - '()': 'colorlog.ColoredFormatter', - 'format': "%(log_color)s%(levelname)-8s%(reset)s %(blue)s%(message)s" + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "colored": { + "()": "colorlog.ColoredFormatter", + "format": "%(log_color)s%(levelname)-8s%(reset)s %(blue)s%(message)s", }, "simple_date": { "format": "%(levelname)-8s %(asctime)s %(name)s - %(message)s", @@ -139,16 +139,16 @@ def setup_logging(level=logging.INFO, main_log=None, error_log=None, add_loggers # Domain and file store location for uploading to production Studio server DEFAULT_DOMAIN = "https://api.studio.learningequality.org" -DOMAIN_ENV = os.getenv('STUDIO_URL', None) +DOMAIN_ENV = os.getenv("STUDIO_URL", None) if DOMAIN_ENV is None: # check old ENV varable for backward compatibility - DOMAIN_ENV = os.getenv('CONTENTWORKSHOP_URL', None) + DOMAIN_ENV = os.getenv("CONTENTWORKSHOP_URL", None) DOMAIN = DOMAIN_ENV if DOMAIN_ENV else DEFAULT_DOMAIN -if DOMAIN.endswith('/'): - DOMAIN = DOMAIN.rstrip('/') -FILE_STORE_LOCATION = hashlib.md5(DOMAIN.encode('utf-8')).hexdigest() +if DOMAIN.endswith("/"): + DOMAIN = DOMAIN.rstrip("/") +FILE_STORE_LOCATION = hashlib.md5(DOMAIN.encode("utf-8")).hexdigest() # Allow users to choose which phantomjs they use -PHANTOMJS_PATH = os.getenv('PHANTOMJS_PATH', None) +PHANTOMJS_PATH = os.getenv("PHANTOMJS_PATH", None) # URL 
for authenticating user on Kolibri Studio AUTHENTICATION_URL = "{domain}/api/internal/authenticate_user_internal" @@ -202,27 +202,32 @@ def setup_logging(level=logging.INFO, main_log=None, error_log=None, add_loggers # Session for downloading files DOWNLOAD_SESSION = requests.Session() -DOWNLOAD_SESSION.mount('file://', FileAdapter()) +DOWNLOAD_SESSION.mount("file://", FileAdapter()) # Environment variable indicating we should use a proxy for youtube_dl downloads USEPROXY = False -USEPROXY = True if os.getenv('USEPROXY') is not None or os.getenv('PROXY_LIST') is not None else False +USEPROXY = ( + True + if os.getenv("USEPROXY") is not None or os.getenv("PROXY_LIST") is not None + else False +) # CSV headers CSV_HEADERS = [ - 'Source ID', - 'Topic Structure', - 'Old Title', - 'New Title', - 'Old Description', - 'New Description', - 'Old Tags', - 'New Tags', - 'Last Modified' + "Source ID", + "Topic Structure", + "Old Title", + "New Title", + "Old Description", + "New Description", + "Old Tags", + "New Tags", + "Last Modified", ] # Automatic temporary direcotry cleanup -chef_temp_dir = os.path.join(os.getcwd(), '.ricecooker-temp') +chef_temp_dir = os.path.join(os.getcwd(), ".ricecooker-temp") + @atexit.register def delete_temp_dir(): @@ -230,6 +235,7 @@ def delete_temp_dir(): LOGGER.debug("Deleting chef temp files at {}".format(chef_temp_dir)) shutil.rmtree(chef_temp_dir) + # While in most cases a chef run will clean up after itself, make sure that if it didn't, # temp files from the old run are deleted so that they do not accumulate. delete_temp_dir() @@ -244,22 +250,21 @@ def delete_temp_dir(): # Record data about past chef runs in chefdata/ dir -DATA_DIR = 'chefdata' -DATA_FILENAME = 'chef_data.json' +DATA_DIR = "chefdata" +DATA_FILENAME = "chef_data.json" DATA_PATH = os.path.join(DATA_DIR, DATA_FILENAME) CHEF_DATA_DEFAULT = { - 'current_run': None, - 'runs': [], - 'tree_archives': { - 'previous': None, - 'current': None - } + "current_run": None, + "runs": [], + "tree_archives": {"previous": None, "current": None}, } -TREES_DATA_DIR = os.path.join(DATA_DIR, 'trees') +TREES_DATA_DIR = os.path.join(DATA_DIR, "trees") # Character limits based on Kolibri models -TRUNCATE_MSG = "\t\t{kind} {id}: {field} {value} is too long - max {max} characters (truncating)" +TRUNCATE_MSG = ( + "\t\t{kind} {id}: {field} {value} is too long - max {max} characters (truncating)" +) MAX_TITLE_LENGTH = 200 MAX_SOURCE_ID_LENGTH = 200 @@ -274,77 +279,66 @@ def delete_temp_dir(): MAX_COPYRIGHT_HOLDER_LENGTH = 200 MAX_CHAR_LIMITS = { - "title": { - "kind": "Node", - "field": "title", - "max": MAX_TITLE_LENGTH - }, - "source_id": { - "kind": "Node", - "field": "source_id", - "max": MAX_SOURCE_ID_LENGTH - }, + "title": {"kind": "Node", "field": "title", "max": MAX_TITLE_LENGTH}, + "source_id": {"kind": "Node", "field": "source_id", "max": MAX_SOURCE_ID_LENGTH}, "description": { "kind": "Node", "field": "description", - "max": MAX_DESCRIPTION_LENGTH - }, - "tagline": { - "kind": "Channel", - "field": "tagline", - "max": MAX_TAGLINE_LENGTH - }, - "author": { - "kind": "Node", - "field": "author", - "max": MAX_AUTHOR_LENGTH + "max": MAX_DESCRIPTION_LENGTH, }, + "tagline": {"kind": "Channel", "field": "tagline", "max": MAX_TAGLINE_LENGTH}, + "author": {"kind": "Node", "field": "author", "max": MAX_AUTHOR_LENGTH}, "question_source_url": { "kind": "Question", "field": "source url", - "max": MAX_SOURCE_URL_LENGTH + "max": MAX_SOURCE_URL_LENGTH, }, "original_filename": { "kind": "File", "field": "original filename", - "max": 
MAX_ORIGINAL_FILENAME_LENGTH + "max": MAX_ORIGINAL_FILENAME_LENGTH, }, "file_source_url": { "kind": "File", "field": "source url", - "max": MAX_SOURCE_URL_LENGTH + "max": MAX_SOURCE_URL_LENGTH, }, "license_description": { "kind": "License", "field": "license description", - "max": MAX_LICENSE_DESCRIPTION_LENGTH + "max": MAX_LICENSE_DESCRIPTION_LENGTH, }, "copyright_holder": { "kind": "License", "field": "copyright holder", - "max": MAX_COPYRIGHT_HOLDER_LENGTH - }, - "provider": { - "kind": "Provider", - "field": "provider", - "max": MAX_PROVIDER_LENGTH + "max": MAX_COPYRIGHT_HOLDER_LENGTH, }, + "provider": {"kind": "Provider", "field": "provider", "max": MAX_PROVIDER_LENGTH}, "aggregator": { "kind": "Aggregator", "field": "aggregator", - "max": MAX_AGGREGATOR_LENGTH + "max": MAX_AGGREGATOR_LENGTH, }, } def print_truncate(field, id, value, kind=None): limit = MAX_CHAR_LIMITS.get(field) - LOGGER.warning(TRUNCATE_MSG.format(kind=kind or limit["kind"], id=id, field=limit["field"], value=value, max=limit["max"])) + LOGGER.warning( + TRUNCATE_MSG.format( + kind=kind or limit["kind"], + id=id, + field=limit["field"], + value=value, + max=limit["max"], + ) + ) + def get_storage_path(filename): - """ get_storage_path: returns path to storage directory for downloading content - Args: filename (str): Name of file to store - Returns: string path to file + """get_storage_path: returns path to storage directory for downloading content + Args: filename (str): Name of file to store + Returns: string path to file """ directory = os.path.join(STORAGE_DIRECTORY, filename[0], filename[1]) # Make storage directory for downloaded files if it doesn't already exist @@ -352,111 +346,131 @@ def get_storage_path(filename): os.makedirs(directory) return os.path.join(directory, filename) + def authentication_url(): - """ authentication_url: returns url to login to Kolibri Studio - Args: None - Returns: string url to authenticate_user_internal endpoint + """authentication_url: returns url to login to Kolibri Studio + Args: None + Returns: string url to authenticate_user_internal endpoint """ return AUTHENTICATION_URL.format(domain=DOMAIN) + def init_file_mapping_store(): - """ init_file_mapping_store: creates log to keep track of downloaded files - Args: None - Returns: None + """init_file_mapping_store: creates log to keep track of downloaded files + Args: None + Returns: None """ # Make storage directory for restore files if it doesn't already exist path = os.path.join(RESTORE_DIRECTORY, FILE_STORE_LOCATION) if not os.path.exists(path): os.makedirs(path) + def get_restore_path(filename): - """ get_restore_path: returns path to directory for restoration points - Args: - filename (str): Name of file to store - Returns: string path to file + """get_restore_path: returns path to directory for restoration points + Args: + filename (str): Name of file to store + Returns: string path to file """ path = os.path.join(RESTORE_DIRECTORY, FILE_STORE_LOCATION) if not os.path.exists(path): os.makedirs(path) - return os.path.join(path, filename + '.pickle') + return os.path.join(path, filename + ".pickle") def check_version_url(): - """ check_version_url: returns url to check ricecooker version - Args: None - Returns: string url to check version endpoint + """check_version_url: returns url to check ricecooker version + Args: None + Returns: string url to check version endpoint """ return VERSION_CHECK_URL.format(domain=DOMAIN) def file_diff_url(): - """ file_diff_url: returns url to get file diff - Args: None - Returns: string url 
to file_diff endpoint + """file_diff_url: returns url to get file diff + Args: None + Returns: string url to file_diff endpoint """ return FILE_DIFF_URL.format(domain=DOMAIN) + def file_upload_url(): - """ file_upload_url: returns url to upload files - Args: None - Returns: string url to file_upload endpoint + """file_upload_url: returns url to upload files + Args: None + Returns: string url to file_upload endpoint """ return FILE_UPLOAD_URL.format(domain=DOMAIN) + def get_upload_url(): - """ file_upload_url: returns url to upload files - Args: None - Returns: string url to file_upload endpoint + """file_upload_url: returns url to upload files + Args: None + Returns: string url to file_upload endpoint """ return GET_UPLOAD_URL.format(domain=DOMAIN) + def get_storage_url(filename): - """ get_storage_url: returns the URL for a given file on the storage service - Args: filename (str): Name of file - Returns: string URL for file + """get_storage_url: returns the URL for a given file on the storage service + Args: filename (str): Name of file + Returns: string URL for file """ - return FILE_STORAGE_URL.format(domain=DOMAIN, f=filename[0], s=filename[1], filename=filename) + return FILE_STORAGE_URL.format( + domain=DOMAIN, f=filename[0], s=filename[1], filename=filename + ) + def create_channel_url(): - """ create_channel_url: returns url to create channel - Args: None - Returns: string url to create_channel endpoint + """create_channel_url: returns url to create channel + Args: None + Returns: string url to create_channel endpoint """ return CREATE_CHANNEL_URL.format(domain=DOMAIN) + def add_nodes_url(): - """ add_nodes_url: returns url to add nodes to channel - Args: None - Returns: string url to add_nodes endpoint + """add_nodes_url: returns url to add nodes to channel + Args: None + Returns: string url to add_nodes endpoint """ return ADD_NODES_URL.format(domain=DOMAIN) + def add_nodes_from_file_url(): - """ add_nodes_from_file_url: returns url to add nodes to channel using json file - Args: None - Returns: string url to add_nodes endpoint + """add_nodes_from_file_url: returns url to add nodes to channel using json file + Args: None + Returns: string url to add_nodes endpoint """ return ADD_NODES_FROM_FILE_URL.format(domain=DOMAIN) + def finish_channel_url(): - """ finish_channel_url: returns url to finish uploading a channel - Args: None - Returns: string url to finish_channel endpoint + """finish_channel_url: returns url to finish uploading a channel + Args: None + Returns: string url to finish_channel endpoint """ return FINISH_CHANNEL_URL.format(domain=DOMAIN) + def open_channel_url(channel, staging=False): - """ open_channel_url: returns url to uploaded channel - Args: - channel (str): channel id of uploaded channel - Returns: string url to open channel + """open_channel_url: returns url to uploaded channel + Args: + channel (str): channel id of uploaded channel + Returns: string url to open channel """ - frontend_domain = DOMAIN.replace("api.", "") # Don't send them to the API domain for preview / review. - return OPEN_CHANNEL_URL.format(domain=frontend_domain, channel_id=channel, access='staging' if staging or STAGE else 'edit') + frontend_domain = DOMAIN.replace( + "api.", "" + ) # Don't send them to the API domain for preview / review. 
+ return OPEN_CHANNEL_URL.format( + domain=frontend_domain, + channel_id=channel, + access="staging" if staging or STAGE else "edit", + ) + def publish_channel_url(): - """ open_channel_url: returns url to publish channel - Args: None - Returns: string url to publish channel + """open_channel_url: returns url to publish channel + Args: None + Returns: string url to publish channel """ return PUBLISH_CHANNEL_URL.format(domain=DOMAIN) diff --git a/ricecooker/exceptions.py b/ricecooker/exceptions.py index 16547c0c..3ef3292b 100644 --- a/ricecooker/exceptions.py +++ b/ricecooker/exceptions.py @@ -1,54 +1,75 @@ # Exceptions that might be raised during tree uploading process + class InvalidCommandException(Exception): """ InvalidCommandException: raised when unrecognized command is entered """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class InvalidUsageException(Exception): """ InvalidUsageException: raised when command line syntax is invalid """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class InvalidFormatException(Exception): """ InvalidFormatException: raised when file format is unrecognized """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class FileNotFoundException(Exception): """ FileNotFoundException: raised when file path is not found """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class UnknownContentKindError(Exception): """ UnknownContentKindError: raised when content kind is unrecognized """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class UnknownQuestionTypeError(Exception): """ UnknownQuestionTypeError: raised when question type is unrecognized """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class UnknownFileTypeError(Exception): """ UnknownFileTypeError: raised when file type is unrecognized """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class UnknownLicenseError(Exception): """ UnknownLicenseError: raised when license is unrecognized """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class InvalidNodeException(Exception): """ InvalidNodeException: raised when node is improperly formatted """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + class InvalidQuestionException(Exception): """ InvalidQuestionException: raised when question is improperly formatted """ - def __init__(self,*args,**kwargs): - Exception.__init__(self,*args,**kwargs) + + def __init__(self, *args, **kwargs): + Exception.__init__(self, *args, **kwargs) + def raise_for_invalid_channel(channel): - pass \ No newline at end of file + pass 
diff --git a/ricecooker/managers/progress.py b/ricecooker/managers/progress.py index 6a0870b3..dce18f8a 100644 --- a/ricecooker/managers/progress.py +++ b/ricecooker/managers/progress.py @@ -1,25 +1,28 @@ -import pickle import os +import pickle import time from enum import Enum + from .. import config + class Status(Enum): - """ Enum containing all statuses Ricecooker can have - - Steps: - INIT: Ricecooker process has been started - CONSTRUCT_CHANNEL: Ricecooker is ready to call sushi chef's construct_channel method - CREATE_TREE: Ricecooker is ready to create relationships between nodes - DOWNLOAD_FILES: Ricecooker is ready to start downloading files - GET_FILE_DIFF: Ricecooker is ready to get file diff from Kolibri Studio - START_UPLOAD: Ricecooker is ready to start uploading files to Kolibri Studio - UPLOADING_FILES: Ricecooker is in the middle of uploading files - UPLOAD_CHANNEL: Ricecooker is ready to upload the channel to Kolibri Studio - PUBLISH_CHANNEL: Ricecooker is ready to publish the channel to Kolibri - DONE: Ricecooker is done - LAST: Place where Ricecooker left off + """Enum containing all statuses Ricecooker can have + + Steps: + INIT: Ricecooker process has been started + CONSTRUCT_CHANNEL: Ricecooker is ready to call sushi chef's construct_channel method + CREATE_TREE: Ricecooker is ready to create relationships between nodes + DOWNLOAD_FILES: Ricecooker is ready to start downloading files + GET_FILE_DIFF: Ricecooker is ready to get file diff from Kolibri Studio + START_UPLOAD: Ricecooker is ready to start uploading files to Kolibri Studio + UPLOADING_FILES: Ricecooker is in the middle of uploading files + UPLOAD_CHANNEL: Ricecooker is ready to upload the channel to Kolibri Studio + PUBLISH_CHANNEL: Ricecooker is ready to publish the channel to Kolibri + DONE: Ricecooker is done + LAST: Place where Ricecooker left off """ + INIT = 0 CONSTRUCT_CHANNEL = 1 CREATE_TREE = 2 @@ -34,20 +37,20 @@ class Status(Enum): class RestoreManager: - """ Manager for handling resuming rice cooking process - - Attributes: - restore_path (str): path to .pickle file to store progress - channel (Channel): channel Ricecooker is creating - tree (ChannelManager): manager Ricecooker is using - files_downloaded ([str]): list of files that have been downloaded - file_mapping ({filename:...}): filenames mapped to metadata - files_failed ([str]): list of files that failed to download - file_diff ([str]): list of files that don't exist on Kolibri Studio - files_uploaded ([str]): list of files that have been successfully uploaded - channel_link (str): link to uploaded channel - channel_id (str): id of channel that has been uploaded - status (str): status of Ricecooker + """Manager for handling resuming rice cooking process + + Attributes: + restore_path (str): path to .pickle file to store progress + channel (Channel): channel Ricecooker is creating + tree (ChannelManager): manager Ricecooker is using + files_downloaded ([str]): list of files that have been downloaded + file_mapping ({filename:...}): filenames mapped to metadata + files_failed ([str]): list of files that failed to download + file_diff ([str]): list of files that don't exist on Kolibri Studio + files_uploaded ([str]): list of files that have been successfully uploaded + channel_link (str): link to uploaded channel + channel_id (str): id of channel that has been uploaded + status (str): status of Ricecooker """ def __init__(self): @@ -64,43 +67,52 @@ def __init__(self): self.timestamp = time.time() def check_for_session(self, status=None): 
- """ check_for_session: see if session is in progress - Args: - status (str): step to check if last session reached (optional) - Returns: boolean indicating if session exists + """check_for_session: see if session is in progress + Args: + status (str): step to check if last session reached (optional) + Returns: boolean indicating if session exists """ status = Status.LAST if status is None else status - return os.path.isfile(self.get_restore_path(status)) and os.path.getsize(self.get_restore_path(status)) > 0 + return ( + os.path.isfile(self.get_restore_path(status)) + and os.path.getsize(self.get_restore_path(status)) > 0 + ) def get_restore_path(self, status=None): - """ get_restore_path: get path to restoration file - Args: - status (str): step to get restore file (optional) - Returns: string path to restoration file + """get_restore_path: get path to restoration file + Args: + status (str): step to get restore file (optional) + Returns: string path to restoration file """ status = self.get_status() if status is None else status return config.get_restore_path(status.name.lower()) def __record_progress(self, next_step=None): - """ __record_progress: save progress to respective restoration file - Args: None - Returns: None + """__record_progress: save progress to respective restoration file + Args: None + Returns: None """ - with open(self.get_restore_path(Status.LAST), 'wb') as handle, open(self.get_restore_path(), 'wb') as step_handle: + with open(self.get_restore_path(Status.LAST), "wb") as handle, open( + self.get_restore_path(), "wb" + ) as step_handle: pickle.dump(self, handle) pickle.dump(self, step_handle) def load_progress(self, resume_step): - """ load_progress: loads progress from restoration file - Args: resume_step (str): step at which to resume session - Returns: manager with progress from step + """load_progress: loads progress from restoration file + Args: resume_step (str): step at which to resume session + Returns: manager with progress from step """ resume_step = Status[resume_step] progress_path = self.get_restore_path(resume_step) # If progress is corrupted, revert to step before while not self.check_for_session(resume_step): - config.LOGGER.error("Ricecooker has not reached {0} status. Reverting to earlier step...".format(resume_step.name)) + config.LOGGER.error( + "Ricecooker has not reached {0} status. 
Reverting to earlier step...".format( + resume_step.name + ) + ) # All files are corrupted or absent, restart process if resume_step.value - 1 < 0: self.init_session() @@ -110,7 +122,7 @@ def load_progress(self, resume_step): config.LOGGER.error("Starting from status {0}".format(resume_step.name)) # Load manager - with open(progress_path, 'rb') as handle: + with open(progress_path, "rb") as handle: manager = pickle.load(handle) if isinstance(manager, RestoreManager): return manager @@ -118,23 +130,23 @@ def load_progress(self, resume_step): return self def get_status(self): - """ get_status: retrieves current status of Ricecooker - Args: None - Returns: string status of Ricecooker + """get_status: retrieves current status of Ricecooker + Args: None + Returns: string status of Ricecooker """ return self.status def get_status_val(self): - """ get_status_val: retrieves value of status of Ricecooker - Args: None - Returns: number value of status of Ricecooker + """get_status_val: retrieves value of status of Ricecooker + Args: None + Returns: number value of status of Ricecooker """ return self.status.value def init_session(self): - """ init_session: sets session to beginning status - Args: None - Returns: None + """init_session: sets session to beginning status + Args: None + Returns: None """ # Clear out previous session's restoration files for status in Status: @@ -146,78 +158,80 @@ def init_session(self): self.__record_progress(Status.CONSTRUCT_CHANNEL) def set_channel(self, channel): - """ set_channel: records progress from constructed channel - Args: channel (Channel): channel Ricecooker is creating - Returns: None + """set_channel: records progress from constructed channel + Args: channel (Channel): channel Ricecooker is creating + Returns: None """ self.channel = channel self.__record_progress(Status.CREATE_TREE) def set_tree(self, tree): - """ set_channel: records progress from creating the tree - Args: tree (ChannelManager): manager Ricecooker is using - Returns: None + """set_channel: records progress from creating the tree + Args: tree (ChannelManager): manager Ricecooker is using + Returns: None """ self.tree = tree self.__record_progress(Status.DOWNLOAD_FILES) def set_files(self, files_downloaded, files_failed): - """ set_files: records progress from downloading files - Args: - files_downloaded ([str]): list of files that have been downloaded - files_failed ([str]): list of files that failed to download - Returns: None + """set_files: records progress from downloading files + Args: + files_downloaded ([str]): list of files that have been downloaded + files_failed ([str]): list of files that failed to download + Returns: None """ self.files_downloaded = files_downloaded self.files_failed = files_failed self.__record_progress(Status.GET_FILE_DIFF) def set_diff(self, file_diff): - """ set_diff: records progress from getting file diff - Args: file_diff ([str]): list of files that don't exist on Kolibri Studio - Returns: None + """set_diff: records progress from getting file diff + Args: file_diff ([str]): list of files that don't exist on Kolibri Studio + Returns: None """ self.file_diff = file_diff self.__record_progress(Status.START_UPLOAD) def set_uploading(self, files_uploaded): - """ set_uploading: records progress during uploading files - Args: files_uploaded ([str]): list of files that have been successfully uploaded - Returns: None + """set_uploading: records progress during uploading files + Args: files_uploaded ([str]): list of files that have been successfully uploaded + 
Returns: None """ self.files_uploaded = files_uploaded self.__record_progress(Status.UPLOADING_FILES) def set_uploaded(self, files_uploaded): - """ set_uploaded: records progress after uploading files - Args: files_uploaded ([str]): list of files that have been successfully uploaded - Returns: None + """set_uploaded: records progress after uploading files + Args: files_uploaded ([str]): list of files that have been successfully uploaded + Returns: None """ self.files_uploaded = files_uploaded self.__record_progress(Status.UPLOAD_CHANNEL) def set_channel_created(self, channel_link, channel_id): - """ set_channel_created: records progress after creating channel on Kolibri Studio - Args: - channel_link (str): link to uploaded channel - channel_id (str): id of channel that has been uploaded - Returns: None + """set_channel_created: records progress after creating channel on Kolibri Studio + Args: + channel_link (str): link to uploaded channel + channel_id (str): id of channel that has been uploaded + Returns: None """ self.channel_link = channel_link self.channel_id = channel_id - self.__record_progress(Status.PUBLISH_CHANNEL if config.PUBLISH else Status.DONE) + self.__record_progress( + Status.PUBLISH_CHANNEL if config.PUBLISH else Status.DONE + ) def set_published(self): - """ set_published: records progress after channel has been published - Args: None - Returns: None + """set_published: records progress after channel has been published + Args: None + Returns: None """ self.__record_progress(Status.DONE) def set_done(self): - """ set_done: records progress after Ricecooker process has been completed - Args: None - Returns: None + """set_done: records progress after Ricecooker process has been completed + Args: None + Returns: None """ self.__record_progress(Status.DONE) diff --git a/ricecooker/managers/tree.py b/ricecooker/managers/tree.py index fcdd6144..1141f154 100644 --- a/ricecooker/managers/tree.py +++ b/ricecooker/managers/tree.py @@ -1,19 +1,20 @@ import codecs import json -import requests import sys +import requests from requests.exceptions import RequestException from .. 
import config class ChannelManager: - """ Manager for handling channel tree structure and communicating to server + """Manager for handling channel tree structure and communicating to server - Attributes: - channel (Channel): channel that manager is handling + Attributes: + channel (Channel): channel that manager is handling """ + def __init__(self, channel): self.channel = channel # Channel to process self.uploaded_files = [] @@ -22,9 +23,9 @@ def __init__(self, channel): self.file_map = {} def validate(self): - """ validate: checks if tree structure is valid - Args: None - Returns: boolean indicating if tree is valid + """validate: checks if tree structure is valid + Args: None + Returns: boolean indicating if tree is valid """ return self.channel.validate_tree() @@ -37,7 +38,9 @@ def process_tree(self, channel_node): """ file_names = [] self.process_tree_recur(file_names, channel_node) - return [x for x in set(file_names) if x] # Remove any duplicate or None filenames + return [ + x for x in set(file_names) if x + ] # Remove any duplicate or None filenames def process_tree_recur(self, file_names, node): """ @@ -48,57 +51,74 @@ def process_tree_recur(self, file_names, node): """ # Process node's children for child_node in node.children: - self.process_tree_recur(file_names, child_node) # Call children first in case a tiled thumbnail is needed + self.process_tree_recur( + file_names, child_node + ) # Call children first in case a tiled thumbnail is needed file_names.extend(node.process_files()) for node_file in node.files: self.file_map[node_file.get_filename()] = node_file - def check_for_files_failed(self): - """ check_for_files_failed: print any files that failed during download process - Args: None - Returns: None + """check_for_files_failed: print any files that failed during download process + Args: None + Returns: None """ if len(config.FAILED_FILES) > 0: - config.LOGGER.error(" {} file(s) have failed to download".format(len(config.FAILED_FILES))) + config.LOGGER.error( + " {} file(s) have failed to download".format(len(config.FAILED_FILES)) + ) for f in config.FAILED_FILES: - if f.node: # files associated with a a content node - info = "{0} {id}".format(f.node.kind.capitalize(), id=f.node.source_id) + if f.node: # files associated with a a content node + info = "{0} {id}".format( + f.node.kind.capitalize(), id=f.node.source_id + ) elif f.assessment_item: # files associated with an assessment item info = "{0} {id}".format("Question", id=f.assessment_item.source_id) - else: # files not associated with a node or an assessment item + else: # files not associated with a node or an assessment item info = f.__class__.__name__ file_identifier = f.__dict__ - if hasattr(f, 'path') and f.path: + if hasattr(f, "path") and f.path: file_identifier = f.path - elif hasattr(f, 'youtube_url') and f.youtube_url: + elif hasattr(f, "youtube_url") and f.youtube_url: file_identifier = f.youtube_url - config.LOGGER.warning("\t{0}: {id} \n\t {err}".format(info, id=file_identifier, err=f.error)) + config.LOGGER.warning( + "\t{0}: {id} \n\t {err}".format( + info, id=file_identifier, err=f.error + ) + ) else: config.LOGGER.info(" All files were successfully downloaded") def get_file_diff(self, files_to_diff): - """ get_file_diff: retrieves list of files that do not exist on content curation server - Args: None - Returns: list of files that are not on server + """get_file_diff: retrieves list of files that do not exist on content curation server + Args: None + Returns: list of files that are not on server """ 
file_diff_result = [] - chunks = [files_to_diff[x:x+1000] for x in range(0, len(files_to_diff), 1000)] + chunks = [ + files_to_diff[x : x + 1000] for x in range(0, len(files_to_diff), 1000) + ] file_count = 0 total_count = len(files_to_diff) for chunk in chunks: - response = config.SESSION.post(config.file_diff_url(), data=json.dumps(chunk)) + response = config.SESSION.post( + config.file_diff_url(), data=json.dumps(chunk) + ) response.raise_for_status() file_diff_result += json.loads(response._content.decode("utf-8")) file_count += len(chunk) - config.LOGGER.info("\tGot file diff for {0} out of {1} files".format(file_count, total_count)) + config.LOGGER.info( + "\tGot file diff for {0} out of {1} files".format( + file_count, total_count + ) + ) return file_diff_result def do_file_upload(self, f): - with open(config.get_storage_path(f), 'rb') as file_obj: + with open(config.get_storage_path(f), "rb") as file_obj: file_data = self.file_map[f] data = { "size": file_data.size, @@ -110,41 +130,52 @@ def do_file_upload(self, f): url_response = config.SESSION.post(config.get_upload_url(), data=data) if url_response.status_code == 200: response_data = url_response.json() - upload_url = response_data['uploadURL'] - content_type = response_data['mimetype'] - might_skip = response_data['might_skip'] + upload_url = response_data["uploadURL"] + content_type = response_data["mimetype"] + might_skip = response_data["might_skip"] if might_skip: head_response = config.SESSION.head(config.get_storage_url(f)) if head_response.status_code == 200: return - b64checksum = codecs.encode(codecs.decode(file_data.checksum, 'hex'), 'base64').decode().strip() + b64checksum = ( + codecs.encode(codecs.decode(file_data.checksum, "hex"), "base64") + .decode() + .strip() + ) headers = { - 'Content-Type': content_type, - 'Content-MD5': b64checksum, + "Content-Type": content_type, + "Content-MD5": b64checksum, } - response = config.SESSION.put(upload_url, headers=headers, data=file_obj) + response = config.SESSION.put( + upload_url, headers=headers, data=file_obj + ) if response.status_code == 200: return raise RequestException(response._content.decode("utf-8")) else: raise RequestException(url_response._content.decode("utf-8")) - def upload_files(self, file_list): - """ upload_files: uploads files to server - Args: - file_list (str): list of files to upload - Returns: None + """upload_files: uploads files to server + Args: + file_list (str): list of files to upload + Returns: None """ counter = 0 - files_to_upload = list(set(file_list) - set(self.uploaded_files)) # In case restoring from previous session + files_to_upload = list( + set(file_list) - set(self.uploaded_files) + ) # In case restoring from previous session try: for f in files_to_upload: try: self.do_file_upload(f) self.uploaded_files.append(f) counter += 1 - config.LOGGER.info("\tUploaded {0} ({count}/{total}) ".format(f, count=counter, total=len(files_to_upload))) + config.LOGGER.info( + "\tUploaded {0} ({count}/{total}) ".format( + f, count=counter, total=len(files_to_upload) + ) + ) except Exception as e: config.LOGGER.info(e) self.failed_uploads[f] = str(e) @@ -152,22 +183,25 @@ def upload_files(self, file_list): config.PROGRESS_MANAGER.set_uploading(self.uploaded_files) def reattempt_upload_fails(self): - """ reattempt_upload_fails: uploads failed files to server - Args: None - Returns: None + """reattempt_upload_fails: uploads failed files to server + Args: None + Returns: None """ if len(self.failed_uploads) > 0: - config.LOGGER.info("Reattempting to 
upload {0} file(s)...".format(len(self.failed_uploads))) + config.LOGGER.info( + "Reattempting to upload {0} file(s)...".format(len(self.failed_uploads)) + ) current_fails = [k for k in self.failed_uploads] self.failed_uploads = {} self.upload_files(current_fails) def upload_tree(self): - """ upload_tree: sends processed channel data to server to create tree - Args: None - Returns: link to uploadedchannel + """upload_tree: sends processed channel data to server to create tree + Args: None + Returns: link to uploadedchannel """ from datetime import datetime + start_time = datetime.now() root, channel_id = self.add_channel() self.node_count_dict = {"upload_count": 0, "total_count": self.channel.count()} @@ -183,7 +217,9 @@ def upload_tree(self): self.check_failed() channel_id, channel_link = self.commit_channel(channel_id) end_time = datetime.now() - config.LOGGER.info("Upload time: {time}s".format(time=(end_time - start_time).total_seconds())) + config.LOGGER.info( + "Upload time: {time}s".format(time=(end_time - start_time).total_seconds()) + ) return channel_id, channel_link def truncate_fields(self, node): @@ -194,45 +230,57 @@ def truncate_fields(self, node): def reattempt_failed(self, failed): for node_id in failed: node = failed[node_id] - config.LOGGER.info("\tReattempting {0}s".format(str(node['node']))) - for f in node['node'].files: + config.LOGGER.info("\tReattempting {0}s".format(str(node["node"]))) + for f in node["node"].files: # Attempt to upload file try: assert f.filename, "File failed to download (cannot be uploaded)" - with open(config.get_storage_path(f.filename), 'rb') as file_obj: - response = config.SESSION.post(config.file_upload_url(), files={'file': file_obj}) + with open(config.get_storage_path(f.filename), "rb") as file_obj: + response = config.SESSION.post( + config.file_upload_url(), files={"file": file_obj} + ) response.raise_for_status() self.uploaded_files.append(f.filename) except AssertionError as ae: config.LOGGER.warning(ae) # Attempt to create node - self.add_nodes(node_id, node['node']) + self.add_nodes(node_id, node["node"]) def check_failed(self, print_warning=True): if len(self.failed_node_builds) > 0: if print_warning: - config.LOGGER.warning("WARNING: The following nodes have one or more descendants that could not be created:") + config.LOGGER.warning( + "WARNING: The following nodes have one or more descendants that could not be created:" + ) for node_id in self.failed_node_builds: node = self.failed_node_builds[node_id] - config.LOGGER.warning("\t{} ({})".format(str(node['node']), node['error'])) + config.LOGGER.warning( + "\t{} ({})".format(str(node["node"]), node["error"]) + ) else: - config.LOGGER.error("Failed to create descendants for {} node(s).".format(len(self.failed_node_builds))) + config.LOGGER.error( + "Failed to create descendants for {} node(s).".format( + len(self.failed_node_builds) + ) + ) return True else: config.LOGGER.info(" All nodes were created successfully.") return False def add_channel(self): - """ add_channel: sends processed channel data to server to create tree - Args: None - Returns: link to uploadedchannel + """add_channel: sends processed channel data to server to create tree + Args: None + Returns: link to uploadedchannel """ config.LOGGER.info(" Creating channel {0}".format(self.channel.title)) self.channel.truncate_fields() payload = { - "channel_data":self.channel.to_dict(), + "channel_data": self.channel.to_dict(), } - response = config.SESSION.post(config.create_channel_url(), data=json.dumps(payload)) + 
response = config.SESSION.post( + config.create_channel_url(), data=json.dumps(payload) + ) try: response.raise_for_status() except Exception: @@ -240,96 +288,133 @@ def add_channel(self): raise new_channel = json.loads(response._content.decode("utf-8")) - return new_channel['root'], new_channel['channel_id'] + return new_channel["root"], new_channel["channel_id"] def add_nodes(self, root_id, current_node, indent=1): - """ add_nodes: adds processed nodes to tree - Args: - root_id (str): id of parent node on Kolibri Studio - current_node (Node): node to publish children - indent (int): level of indentation for printing - Returns: link to uploadedchannel + """add_nodes: adds processed nodes to tree + Args: + root_id (str): id of parent node on Kolibri Studio + current_node (Node): node to publish children + indent (int): level of indentation for printing + Returns: link to uploadedchannel """ # if the current node has no children, no need to continue if not current_node.children: return - config.LOGGER.info("({count} of {total} uploaded) {indent}Processing {title} ({kind})".format( - count=self.node_count_dict['upload_count'], - total=self.node_count_dict['total_count'], - indent=" " * indent, - title=current_node.title, - kind=current_node.__class__.__name__) + config.LOGGER.info( + "({count} of {total} uploaded) {indent}Processing {title} ({kind})".format( + count=self.node_count_dict["upload_count"], + total=self.node_count_dict["total_count"], + indent=" " * indent, + title=current_node.title, + kind=current_node.__class__.__name__, + ) ) # Send children in chunks to avoid gateway errors try: - chunks = [current_node.children[x:x+10] for x in range(0, len(current_node.children), 10)] + chunks = [ + current_node.children[x : x + 10] + for x in range(0, len(current_node.children), 10) + ] for chunk in chunks: payload_children = [] for child in chunk: - failed = [f for f in child.files if f.is_primary and (not f.filename or self.failed_uploads.get(f.filename))] + failed = [ + f + for f in child.files + if f.is_primary + and (not f.filename or self.failed_uploads.get(f.filename)) + ] if any(failed): if not self.failed_node_builds.get(root_id): error_message = "" for fail in failed: - reason = fail.filename + ": " + self.failed_uploads.get(fail.filename) if fail.filename else "File failed to download" + reason = ( + fail.filename + + ": " + + self.failed_uploads.get(fail.filename) + if fail.filename + else "File failed to download" + ) error_message = error_message + reason + ", " - self.failed_node_builds[root_id] = {'node': current_node, 'error': error_message[:-2]} + self.failed_node_builds[root_id] = { + "node": current_node, + "error": error_message[:-2], + } else: payload_children.append(child.to_dict()) - payload = { - 'root_id': root_id, - 'content_data': payload_children - } + payload = {"root_id": root_id, "content_data": payload_children} # When iceqube is integrated, use this method to utilize upload file optimizations # response = config.SESSION.post(config.add_nodes_from_file_url(), files={'file': json.dumps(payload)}) - response = config.SESSION.post(config.add_nodes_url(), data=json.dumps(payload)) + response = config.SESSION.post( + config.add_nodes_url(), data=json.dumps(payload) + ) if response.status_code != 200: - self.failed_node_builds[root_id] = {'node': current_node, 'error': response.reason} + self.failed_node_builds[root_id] = { + "node": current_node, + "error": response.reason, + } else: response_json = json.loads(response._content.decode("utf-8")) - 
self.node_count_dict['upload_count'] += len(chunk) + self.node_count_dict["upload_count"] += len(chunk) - if response_json['root_ids'].get(child.get_node_id().hex): + if response_json["root_ids"].get(child.get_node_id().hex): for child in chunk: - self.add_nodes(response_json['root_ids'].get(child.get_node_id().hex), child, indent + 1) + self.add_nodes( + response_json["root_ids"].get(child.get_node_id().hex), + child, + indent + 1, + ) except ConnectionError as ce: - self.failed_node_builds[root_id] = {'node': current_node, 'error': ce} + self.failed_node_builds[root_id] = {"node": current_node, "error": ce} def commit_channel(self, channel_id): - """ commit_channel: commits channel to Kolibri Studio - Args: - channel_id (str): channel's id on Kolibri Studio - Returns: channel id and link to uploadedchannel + """commit_channel: commits channel to Kolibri Studio + Args: + channel_id (str): channel's id on Kolibri Studio + Returns: channel id and link to uploadedchannel """ payload = { - "channel_id":channel_id, + "channel_id": channel_id, "stage": config.STAGE, } - response = config.SESSION.post(config.finish_channel_url(), data=json.dumps(payload)) + response = config.SESSION.post( + config.finish_channel_url(), data=json.dumps(payload) + ) if response.status_code != 200: config.LOGGER.error("") - config.LOGGER.error("Could not activate channel: {}\n".format(response._content.decode('utf-8'))) + config.LOGGER.error( + "Could not activate channel: {}\n".format( + response._content.decode("utf-8") + ) + ) if response.status_code == 403: - config.LOGGER.error("Channel can be viewed at {}\n\n".format(config.open_channel_url(channel_id, staging=True))) + config.LOGGER.error( + "Channel can be viewed at {}\n\n".format( + config.open_channel_url(channel_id, staging=True) + ) + ) sys.exit() response.raise_for_status() new_channel = json.loads(response._content.decode("utf-8")) - channel_link = config.open_channel_url(new_channel['new_channel']) + channel_link = config.open_channel_url(new_channel["new_channel"]) return channel_id, channel_link def publish(self, channel_id): - """ publish: publishes tree to Kolibri - Args: - channel_id (str): channel's id on Kolibri Studio - Returns: None + """publish: publishes tree to Kolibri + Args: + channel_id (str): channel's id on Kolibri Studio + Returns: None """ payload = { - "channel_id":channel_id, + "channel_id": channel_id, } - response = config.SESSION.post(config.publish_channel_url(), data=json.dumps(payload)) + response = config.SESSION.post( + config.publish_channel_url(), data=json.dumps(payload) + ) response.raise_for_status() diff --git a/ricecooker/templates/sushichef.py b/ricecooker/templates/sushichef.py.template similarity index 100% rename from ricecooker/templates/sushichef.py rename to ricecooker/templates/sushichef.py.template diff --git a/ricecooker/utils/browser.py b/ricecooker/utils/browser.py index d2829e8e..cdad5fe4 100644 --- a/ricecooker/utils/browser.py +++ b/ricecooker/utils/browser.py @@ -1,23 +1,27 @@ -import os, urllib, posixpath, webbrowser -from http.server import HTTPServer, BaseHTTPRequestHandler, SimpleHTTPRequestHandler +import os +import posixpath +import urllib +import webbrowser +from http.server import BaseHTTPRequestHandler +from http.server import HTTPServer +from http.server import SimpleHTTPRequestHandler def preview_in_browser(directory, filename="index.html", port=8282): - class RequestHandler(SimpleHTTPRequestHandler): - def translate_path(self, path): # abandon query parameters - path = 
path.split('?',1)[0] - path = path.split('#',1)[0] + path = path.split("?", 1)[0] + path = path.split("#", 1)[0] path = posixpath.normpath(urllib.parse.unquote(path)) - words = path.split('/') + words = path.split("/") words = filter(None, words) path = directory for word in words: drive, word = os.path.splitdrive(word) head, word = os.path.split(word) - if word in (os.curdir, os.pardir): continue + if word in (os.curdir, os.pardir): + continue path = os.path.join(path, word) return path @@ -25,4 +29,4 @@ def translate_path(self, path): webbrowser.open("http://127.0.0.1:{}/{}".format(port, filename)) - httpd.serve_forever() \ No newline at end of file + httpd.serve_forever() diff --git a/ricecooker/utils/caching.py b/ricecooker/utils/caching.py index a7bbe42d..c5d53ab1 100644 --- a/ricecooker/utils/caching.py +++ b/ricecooker/utils/caching.py @@ -1,32 +1,36 @@ -import requests -import cachecontrol - -from datetime import datetime, timedelta +from datetime import datetime +from datetime import timedelta from email.utils import parsedate +import cachecontrol +import requests from cachecontrol import CacheControlAdapter from cachecontrol.caches.file_cache import FileCache -from cachecontrol.heuristics import BaseHeuristic, expire_after, datetime_to_header +from cachecontrol.heuristics import BaseHeuristic +from cachecontrol.heuristics import datetime_to_header +from cachecontrol.heuristics import expire_after class NeverCache(BaseHeuristic): """ Don't cache the response at all. """ + def update_headers(self, response): - return {'cache-control': 'no-cache'} + return {"cache-control": "no-cache"} class CacheForeverHeuristic(BaseHeuristic): """ Cache the response effectively forever. """ + def update_headers(self, response): headers = {} - expires = expire_after(timedelta(weeks=10*52), date=datetime.now()) - headers['expires'] = datetime_to_header(expires) - headers['cache-control'] = 'public' - + expires = expire_after(timedelta(weeks=10 * 52), date=datetime.now()) + headers["expires"] = datetime_to_header(expires) + headers["cache-control"] = "public" + return headers @@ -39,7 +43,9 @@ class InvalidatingCacheControlAdapter(CacheControlAdapter): def __init__(self, heuristic=None, *args, **kw): if not heuristic: heuristic = NeverCache() - super(InvalidatingCacheControlAdapter, self).__init__(*args, heuristic=heuristic, **kw) + super(InvalidatingCacheControlAdapter, self).__init__( + *args, heuristic=heuristic, **kw + ) def send(self, request, **kw): diff --git a/ricecooker/utils/corrections.py b/ricecooker/utils/corrections.py index 9a92f63a..b3ea192d 100755 --- a/ricecooker/utils/corrections.py +++ b/ricecooker/utils/corrections.py @@ -2,10 +2,11 @@ import argparse import copy import csv -from datetime import datetime -import dictdiffer import json import os +from datetime import datetime + +import dictdiffer import requests from ricecooker.config import LOGGER @@ -13,35 +14,34 @@ # CONFIG CONSTANTS for data directories ################################################################################ -STUDIO_CREDENTIALS='credentials/studio.json' -CHEFDATA_DIR = 'chefdata' -STUDIO_TREES_DIR = os.path.join(CHEFDATA_DIR, 'studiotrees') +STUDIO_CREDENTIALS = "credentials/studio.json" +CHEFDATA_DIR = "chefdata" +STUDIO_TREES_DIR = os.path.join(CHEFDATA_DIR, "studiotrees") if not os.path.exists(STUDIO_TREES_DIR): os.makedirs(STUDIO_TREES_DIR) -CORRECTIONS_DIR = os.path.join(CHEFDATA_DIR, 'corrections') +CORRECTIONS_DIR = os.path.join(CHEFDATA_DIR, "corrections") if not 
os.path.exists(CORRECTIONS_DIR): os.makedirs(CORRECTIONS_DIR) - # CORRECTIONS STRUCTURE v0.2 ################################################################################ -ACTION_KEY = 'Action' -NODE_ID_KEY = 'Node ID' -CONTENT_ID_KEY = 'Content ID' -PATH_KEY = 'Path' -CONTENT_KIND_KEY = 'Content Kind' -OLD_TITLE_KEY = 'Old Title' -NEW_TITLE_KEY = 'New Title' -OLD_DESCR_KEY = 'Old Description' -NEW_DESCR_KEY = 'New Description' -OLD_TAGS_KEY = 'Old Tags' -NEW_TAGS_KEY = 'New Tags' -OLD_COPYRIGHT_HOLDER_KEY = 'Old Copyright Holder' -NEW_COPYRIGHT_HOLDER_KEY = 'New Copyright Holder' -OLD_AUTHOR_KEY = 'Old Author' -NEW_AUTHOR_KEY = 'New Author' +ACTION_KEY = "Action" +NODE_ID_KEY = "Node ID" +CONTENT_ID_KEY = "Content ID" +PATH_KEY = "Path" +CONTENT_KIND_KEY = "Content Kind" +OLD_TITLE_KEY = "Old Title" +NEW_TITLE_KEY = "New Title" +OLD_DESCR_KEY = "Old Description" +NEW_DESCR_KEY = "New Description" +OLD_TAGS_KEY = "Old Tags" +NEW_TAGS_KEY = "New Tags" +OLD_COPYRIGHT_HOLDER_KEY = "Old Copyright Holder" +NEW_COPYRIGHT_HOLDER_KEY = "New Copyright Holder" +OLD_AUTHOR_KEY = "Old Author" +NEW_AUTHOR_KEY = "New Author" CORRECTIONS_HEADER = [ ACTION_KEY, @@ -64,71 +64,88 @@ # What columns to export metadata to... TARGET_COLUMNS = { - 'title': [OLD_TITLE_KEY, NEW_TITLE_KEY], - 'description': [OLD_DESCR_KEY, NEW_DESCR_KEY], - 'tags': [OLD_TAGS_KEY, NEW_TAGS_KEY], - 'copyright_holder': [OLD_COPYRIGHT_HOLDER_KEY, NEW_COPYRIGHT_HOLDER_KEY], - 'author': [OLD_AUTHOR_KEY, NEW_AUTHOR_KEY], + "title": [OLD_TITLE_KEY, NEW_TITLE_KEY], + "description": [OLD_DESCR_KEY, NEW_DESCR_KEY], + "tags": [OLD_TAGS_KEY, NEW_TAGS_KEY], + "copyright_holder": [OLD_COPYRIGHT_HOLDER_KEY, NEW_COPYRIGHT_HOLDER_KEY], + "author": [OLD_AUTHOR_KEY, NEW_AUTHOR_KEY], } # default_keys = ['node_id', 'content_id'] # 'studio_id', 'source_id'] -default_export = ['title', 'description', 'tags', 'copyright_holder', 'author'] - - - - - +default_export = ["title", "description", "tags", "copyright_holder", "author"] # Studio Tree Local Cache queries ################################################################################ -def get_channel_tree(api, channel_id, suffix='', update=True): + +def get_channel_tree(api, channel_id, suffix="", update=True): """ Downloads the entire main tree of a Studio channel to a local json file. """ - filename = os.path.join(STUDIO_TREES_DIR, channel_id + suffix + '.json') + filename = os.path.join(STUDIO_TREES_DIR, channel_id + suffix + ".json") if os.path.exists(filename) and not update: - print(' Loading cached tree for channel_id=', channel_id, 'from', filename) - channel_tree = json.load(open(filename, 'r')) + print(" Loading cached tree for channel_id=", channel_id, "from", filename) + channel_tree = json.load(open(filename, "r")) return channel_tree else: - print(' Downloading tree for channel_id=', channel_id, ' and saving to', filename) + print( + " Downloading tree for channel_id=", channel_id, " and saving to", filename + ) root_studio_id = api.get_channel_root_studio_id(channel_id) # next step takes long since recursively making O(n) API calls! channel_tree = api.get_tree_for_studio_id(root_studio_id) - json.dump(channel_tree, open(filename, 'w'), indent=4, ensure_ascii=False, sort_keys=True) + json.dump( + channel_tree, + open(filename, "w"), + indent=4, + ensure_ascii=False, + sort_keys=True, + ) return channel_tree - def print_channel_tree(channel_tree): """ Print tree structure. 
""" - def print_tree(subtree, indent=''): - kind = subtree.get("kind", 'topic') # topic default to handle channel root + + def print_tree(subtree, indent=""): + kind = subtree.get("kind", "topic") # topic default to handle channel root if kind == "exercise": - print(indent, subtree['title'], - 'kind=', subtree['kind'], - len(subtree['assessment_items']), 'questions', - len(subtree['files']), 'files') + print( + indent, + subtree["title"], + "kind=", + subtree["kind"], + len(subtree["assessment_items"]), + "questions", + len(subtree["files"]), + "files", + ) else: - print(indent, subtree['title'], - 'kind=', subtree['kind'], - len(subtree['files']), 'files') - for child in subtree['children']: - print_tree(child, indent=indent+' ') - print_tree(channel_tree) + print( + indent, + subtree["title"], + "kind=", + subtree["kind"], + len(subtree["files"]), + "files", + ) + for child in subtree["children"]: + print_tree(child, indent=indent + " ") + print_tree(channel_tree) # CORECTIONS EXPORT ################################################################################ -class CorretionsCsvFileExporter(object): - def __init__(self, csvfilepath='corrections-export.csv', exportattrs=default_export): +class CorretionsCsvFileExporter(object): + def __init__( + self, csvfilepath="corrections-export.csv", exportattrs=default_export + ): self.csvfilepath = csvfilepath self.exportattrs = exportattrs @@ -136,10 +153,9 @@ def download_channel_tree(self, api, channel_id): """ Downloads a complete studio channel_tree from the Studio API. """ - channel_tree = get_channel_tree(api, channel_id, suffix='-export') + channel_tree = get_channel_tree(api, channel_id, suffix="-export") return channel_tree - # Export CSV metadata from external corrections ############################################################################ @@ -150,28 +166,30 @@ def export_channel_tree_as_corrections_csv(self, channel_tree): """ file_path = self.csvfilepath if os.path.exists(file_path): - print('Overwriting previous export', file_path) - with open(file_path, 'w') as csv_file: + print("Overwriting previous export", file_path) + with open(file_path, "w") as csv_file: csvwriter = csv.DictWriter(csv_file, CORRECTIONS_HEADER) csvwriter.writeheader() def _write_subtree(path_tuple, subtree, is_root=False): # print(' '*len(path_tuple) + ' - ', subtree['title']) - kind = subtree['kind'] + kind = subtree["kind"] # TOPIC ############################################################ - if kind == 'topic': + if kind == "topic": if is_root: - self.write_topic_row_from_studio_dict(path_tuple, subtree, is_root=is_root) - for child in subtree['children']: + self.write_topic_row_from_studio_dict( + path_tuple, subtree, is_root=is_root + ) + for child in subtree["children"]: _write_subtree(path_tuple, child) else: self.write_topic_row_from_studio_dict(path_tuple, subtree) - for child in subtree['children']: - _write_subtree(path_tuple+[subtree['title']], child) + for child in subtree["children"]: + _write_subtree(path_tuple + [subtree["title"]], child) # CONTENT NODES #################################################### - elif kind in ['video', 'audio', 'document', 'html5']: + elif kind in ["video", "audio", "document", "html5"]: self.write_content_row_from_studio_dict(path_tuple, subtree) # EXERCISE NODES ################################################### @@ -181,26 +199,25 @@ def _write_subtree(path_tuple, subtree, is_root=False): # for question_dict in subtree['assessment_items']: # self.write_question_row_from_question_dict(source_id, 
question_dict) else: - print('>>>>> skipping node', subtree['title']) + print(">>>>> skipping node", subtree["title"]) path_tuple = [] _write_subtree(path_tuple, channel_tree, is_root=True) - def write_common_row_attributes_from_studio_dict(self, row, studio_dict): # 1. IDENTIFIERS - row[NODE_ID_KEY] = studio_dict['node_id'] - row[CONTENT_ID_KEY] = studio_dict['content_id'] + row[NODE_ID_KEY] = studio_dict["node_id"] + row[CONTENT_ID_KEY] = studio_dict["content_id"] # PATH_KEY is set in specific function - row[CONTENT_KIND_KEY] = studio_dict['kind'] + row[CONTENT_KIND_KEY] = studio_dict["kind"] # 2. METADATA for exportattr in self.exportattrs: target_cols = TARGET_COLUMNS[exportattr] for target_col in target_cols: - if exportattr == 'tags': - tags = studio_dict['tags'] - tags_semicolon_separated = ';'.join(tags) + if exportattr == "tags": + tags = studio_dict["tags"] + tags_semicolon_separated = ";".join(tags) row[target_col] = tags_semicolon_separated else: row[target_col] = studio_dict[exportattr] @@ -208,12 +225,16 @@ def write_common_row_attributes_from_studio_dict(self, row, studio_dict): def write_topic_row_from_studio_dict(self, path_tuple, studio_dict, is_root=False): if is_root: return - print('Generating corrections-export.csv rows for path_tuple ', path_tuple, studio_dict['title']) + print( + "Generating corrections-export.csv rows for path_tuple ", + path_tuple, + studio_dict["title"], + ) file_path = self.csvfilepath - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, CORRECTIONS_HEADER) - title = studio_dict['title'] - path_with_self = '/'.join(path_tuple+[title]) + title = studio_dict["title"] + path_with_self = "/".join(path_tuple + [title]) topic_row = {} self.write_common_row_attributes_from_studio_dict(topic_row, studio_dict) # WRITE TOPIC ROW @@ -222,34 +243,29 @@ def write_topic_row_from_studio_dict(self, path_tuple, studio_dict, is_root=Fals def write_content_row_from_studio_dict(self, path_tuple, studio_dict): file_path = self.csvfilepath - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, CORRECTIONS_HEADER) row = {} self.write_common_row_attributes_from_studio_dict(row, studio_dict) - title = studio_dict['title'] - row[PATH_KEY] = '/'.join(path_tuple+[title]) + title = studio_dict["title"] + row[PATH_KEY] = "/".join(path_tuple + [title]) # WRITE ROW csvwriter.writerow(row) - - - - - - # CSV CORRECTIONS LOADERS ################################################################################ -def save_gsheet_to_local_csv(gsheet_id, gid, csvfilepath='corrections-import.csv'): - GSHEETS_BASE = 'https://docs.google.com/spreadsheets/d/' - SHEET_CSV_URL = GSHEETS_BASE + gsheet_id + '/export?format=csv&gid=' + gid + +def save_gsheet_to_local_csv(gsheet_id, gid, csvfilepath="corrections-import.csv"): + GSHEETS_BASE = "https://docs.google.com/spreadsheets/d/" + SHEET_CSV_URL = GSHEETS_BASE + gsheet_id + "/export?format=csv&gid=" + gid print(SHEET_CSV_URL) response = requests.get(SHEET_CSV_URL) - csv_data = response.content.decode('utf-8') - with open(csvfilepath, 'w') as csvfile: + csv_data = response.content.decode("utf-8") + with open(csvfilepath, "w") as csvfile: csvfile.write(csv_data) - print('Succesfully saved ' + csvfilepath) + print("Succesfully saved " + csvfilepath) return csvfilepath @@ -259,7 +275,7 @@ def _clean_dict(row): """ row_cleaned = {} for key, val in row.items(): - if val is None or val == '': + if val is None or val == 
"": row_cleaned[key] = None else: row_cleaned[key] = val.strip() @@ -267,9 +283,9 @@ def _clean_dict(row): def load_corrections_from_csv(csvfilepath): - csv_path = csvfilepath # download_structure_csv() + csv_path = csvfilepath # download_structure_csv() struct_list = [] - with open(csv_path, 'r') as csvfile: + with open(csv_path, "r") as csvfile: reader = csv.DictReader(csvfile, fieldnames=CORRECTIONS_HEADER) next(reader) # Skip Headers row for row in reader: @@ -286,17 +302,17 @@ def get_csv_corrections(csvfilepath): deletions = [] rows = load_corrections_from_csv(csvfilepath) for i, row in enumerate(rows): - if row[ACTION_KEY] == '' or row[ACTION_KEY] == None: - print('Skipping no-action row', i+1) - elif row[ACTION_KEY] == 'modify': + if row[ACTION_KEY] == "" or row[ACTION_KEY] == None: + print("Skipping no-action row", i + 1) + elif row[ACTION_KEY] == "modify": modifications.append(row) - elif row[ACTION_KEY] == 'delete': + elif row[ACTION_KEY] == "delete": deletions.append(row) else: - print('Uknown Action', row[ACTION_KEY]) + print("Uknown Action", row[ACTION_KEY]) return { - 'modifications': modifications, - 'deletions': deletions, + "modifications": modifications, + "deletions": deletions, } @@ -305,15 +321,15 @@ def get_corrections_by_node_id(csvfilepath, modifyattrs): Convert CSV to internal representaiton of corrections as dicts by node_id. """ corrections_by_node_id = { - 'nodes_modified': {}, - 'nodes_added': {}, - 'nodes_deleted': {}, - 'nodes_moved': {}, + "nodes_modified": {}, + "nodes_added": {}, + "nodes_deleted": {}, + "nodes_moved": {}, } csv_corrections = get_csv_corrections(csvfilepath) # CSV rows GROUP BY corrkind # # Modifications - for row in csv_corrections['modifications']: + for row in csv_corrections["modifications"]: node_id = row[NODE_ID_KEY] # print('Found MODIFY row of CSV for node_id', node_id) # @@ -323,37 +339,44 @@ def get_corrections_by_node_id(csvfilepath, modifyattrs): # print('Found MODIFY', attr, 'in row of CSV for node_id', node_id) old_key = TARGET_COLUMNS[attr][0] new_key = TARGET_COLUMNS[attr][1] - if row[new_key] == row[old_key]: # skip if the same + if row[new_key] == row[old_key]: # skip if the same continue else: attributes[attr] = { - 'changed': True, - 'value': row[new_key], - 'old_value': row[old_key], + "changed": True, + "value": row[new_key], + "old_value": row[old_key], } # prepare modifications_dict modifications_dict = { - 'attributes': attributes, + "attributes": attributes, } # add to to corrections_by_node_id - corrections_by_node_id['nodes_modified'][node_id] = modifications_dict + corrections_by_node_id["nodes_modified"][node_id] = modifications_dict # # Deletions - for row in csv_corrections['deletions']: + for row in csv_corrections["deletions"]: node_id = row[NODE_ID_KEY] # print('Found DELETE row in CSV for node_id', node_id) - corrections_by_node_id['nodes_deleted'][node_id] = {'node_id':node_id} + corrections_by_node_id["nodes_deleted"][node_id] = {"node_id": node_id} # # TODO: Additions # TODO: Moves datetimesuffix = datetime.now().strftime("%Y-%m-%d__%H%M") - correctionspath = os.path.join(CORRECTIONS_DIR, 'imported-' + datetimesuffix + '.json') - json.dump(corrections_by_node_id, open(correctionspath, 'w'), indent=4, ensure_ascii=False, sort_keys=True) + correctionspath = os.path.join( + CORRECTIONS_DIR, "imported-" + datetimesuffix + ".json" + ) + json.dump( + corrections_by_node_id, + open(correctionspath, "w"), + indent=4, + ensure_ascii=False, + sort_keys=True, + ) # return correctionspath - # Tree querying API 
################################################################################ @@ -365,65 +388,74 @@ def find_nodes_by_attr(subtree, attr, value): results = [] if subtree[attr] == value: results.append(subtree) - if 'children' in subtree: - for child in subtree['children']: + if "children" in subtree: + for child in subtree["children"]: child_restuls = find_nodes_by_attr(child, attr, value) results.extend(child_restuls) return results + def find_nodes_by_content_id(subtree, content_id): - return find_nodes_by_attr(subtree, 'content_id', content_id) + return find_nodes_by_attr(subtree, "content_id", content_id) + def find_nodes_by_node_id(subtree, node_id): - return find_nodes_by_attr(subtree, 'node_id', node_id) + return find_nodes_by_attr(subtree, "node_id", node_id) + def find_nodes_by_original_source_node_id(subtree, original_source_node_id): - return find_nodes_by_attr(subtree, 'original_source_node_id', original_source_node_id) + return find_nodes_by_attr( + subtree, "original_source_node_id", original_source_node_id + ) + def unresolve_children(node): """ Return copy of node with children = list of studio_id references instead of full data. """ - node = copy.deepcopy(node) - if 'children' in node: + node = copy.deepcopy(node) + if "children" in node: new_children = [] - for child in node['children']: - new_children.append(child['id']) - node['children'] = new_children + for child in node["children"]: + new_children.append(child["id"]) + node["children"] = new_children return node - - - - # SPECIAL REMAP NEEDED FOR ALDARYN CORRECTIONS ################################################################################ -def remap_original_source_node_id_to_node_id(channel_tree, corrections_by_original_source_node_id): - ALL_COORECTIONS_KINDS = ['nodes_modified', 'nodes_added', 'nodes_deleted', 'nodes_moved'] + +def remap_original_source_node_id_to_node_id( + channel_tree, corrections_by_original_source_node_id +): + ALL_COORECTIONS_KINDS = [ + "nodes_modified", + "nodes_added", + "nodes_deleted", + "nodes_moved", + ] corrections_by_node_id = {} for correction_kind in ALL_COORECTIONS_KINDS: if correction_kind in corrections_by_original_source_node_id: corrections_by_node_id[correction_kind] = {} corrections_dict = corrections_by_original_source_node_id[correction_kind] for original_source_node_id, correction in corrections_dict.items(): - results = find_nodes_by_original_source_node_id(channel_tree, original_source_node_id) - assert results, 'no match found based on original_source_node_id search' - assert len(results)==1, 'multiple matches found...' + results = find_nodes_by_original_source_node_id( + channel_tree, original_source_node_id + ) + assert results, "no match found based on original_source_node_id search" + assert len(results) == 1, "multiple matches found..." 
tree_node = results[0] - node_id = tree_node['node_id'] + node_id = tree_node["node_id"] corrections_by_node_id[correction_kind][node_id] = correction return corrections_by_node_id - - - - # CORRECTIONS API CALLS ################################################################################ + def apply_modifications_for_node_id(api, channel_tree, node_id, modifications_dict): """ Given a modification dict of the form, @@ -449,68 +481,75 @@ def apply_modifications_for_node_id(api, channel_tree, node_id, modifications_di """ # print('MODIFYING node_id=', node_id) results = find_nodes_by_node_id(channel_tree, node_id) - assert results, 'no match found based on node_id search' - assert len(results)==1, 'multiple matches found...' + assert results, "no match found based on node_id search" + assert len(results) == 1, "multiple matches found..." tree_node = results[0] - studio_id = tree_node['id'] + studio_id = tree_node["id"] # node_before = unresolve_children(tree_node) node_before = api.get_contentnode(studio_id) # print('node_before', node_before) # PREPARE data for PUT request (starting form copy of old) data = {} - ATTRS_TO_COPY = ['kind', 'id', 'tags', 'prerequisite', 'parent'] + ATTRS_TO_COPY = ["kind", "id", "tags", "prerequisite", "parent"] for attr in ATTRS_TO_COPY: data[attr] = node_before[attr] # - # ADD new_values modified - modifications = modifications_dict['attributes'] + # ADD new_values modified + modifications = modifications_dict["attributes"] for attr, values_diff in modifications.items(): - if values_diff['changed']: + if values_diff["changed"]: current_value = node_before[attr] - expected_old_value = values_diff['old_value'] - new_value = values_diff['value'] - if expected_old_value == new_value: # skip if the same + expected_old_value = values_diff["old_value"] + new_value = values_diff["value"] + if expected_old_value == new_value: # skip if the same continue if current_value != expected_old_value: - print('WARNING expected old value', expected_old_value, 'for', attr, 'but current node value is', current_value) + print( + "WARNING expected old value", + expected_old_value, + "for", + attr, + "but current node value is", + current_value, + ) # print('Changing current_value', current_value, 'for', attr, 'to new value', new_value) data[attr] = new_value else: - print('Skipping attribute', attr, 'because key changed==False') + print("Skipping attribute", attr, "because key changed==False") # PUT - print('PUT studio_id=', studio_id, 'node_id=', node_id) + print("PUT studio_id=", studio_id, "node_id=", node_id) response_data = api.put_contentnode(data) # Check what changed node_after = api.get_contentnode(studio_id) diffs = list(dictdiffer.diff(node_before, node_after)) - print(' diff=', diffs) + print(" diff=", diffs) return response_data def apply_deletion_for_node_id(api, channel_tree, channel_id, node_id, deletion_dict): results = find_nodes_by_node_id(channel_tree, node_id) - assert results, 'no match found based on node_id search' - assert len(results)==1, 'multiple matches found...' + assert results, "no match found based on node_id search" + assert len(results) == 1, "multiple matches found..." 
tree_node = results[0] - studio_id = tree_node['id'] + studio_id = tree_node["id"] # node_before = unresolve_children(tree_node) node_before = api.get_contentnode(studio_id) - + # PREPARE data for DLETE request data = {} - data['id'] = node_before['id'] + data["id"] = node_before["id"] # DELETE - print('DELETE studio_id=', studio_id, 'node_id=', node_id) + print("DELETE studio_id=", studio_id, "node_id=", node_id) response_data = api.delete_contentnode(data, channel_id) # Check what changed node_after = api.get_contentnode(studio_id) diffs = list(dictdiffer.diff(node_before, node_after)) - print(' diff=', diffs) + print(" diff=", diffs) return response_data @@ -535,53 +574,55 @@ def apply_corrections_by_node_id(api, channel_tree, channel_id, corrections_by_n } this function will make the appropriate Studio API calls to apply the patch. """ - LOGGER.debug('Applying corrections...') + LOGGER.debug("Applying corrections...") # # Modifications - for node_id, modifications_dict in corrections_by_node_id['nodes_modified'].items(): + for node_id, modifications_dict in corrections_by_node_id["nodes_modified"].items(): apply_modifications_for_node_id(api, channel_tree, node_id, modifications_dict) # # Deletions - for node_id, deletion_dict in corrections_by_node_id['nodes_deleted'].items(): - apply_deletion_for_node_id(api, channel_tree, channel_id, node_id, deletion_dict) + for node_id, deletion_dict in corrections_by_node_id["nodes_deleted"].items(): + apply_deletion_for_node_id( + api, channel_tree, channel_id, node_id, deletion_dict + ) # TODO: Additions # TODO: Moves - - - - - from ricecooker.utils.libstudio import StudioApi + def get_studio_api(studio_creds=None): if studio_creds is None: if not os.path.exists(STUDIO_CREDENTIALS): - print('ERROR: Studio credentials file', STUDIO_CREDENTIALS, 'not found') - print("""Please create the file and put the following informaiton in it: + print("ERROR: Studio credentials file", STUDIO_CREDENTIALS, "not found") + print( + """Please create the file and put the following informaiton in it: { "token": "", "username": "", "password": "", } - """) - raise ValueError('Missing credentials') + """ + ) + raise ValueError("Missing credentials") studio_creds = json.load(open(STUDIO_CREDENTIALS)) # # Studio API client (note currently needs both session auth and token as well) api = StudioApi( - token=studio_creds['token'], - username=studio_creds['username'], - password=studio_creds['password'], - studio_url=studio_creds.get('studio_url', 'https://studio.learningequality.org') + token=studio_creds["token"], + username=studio_creds["username"], + password=studio_creds["password"], + studio_url=studio_creds.get( + "studio_url", "https://studio.learningequality.org" + ), ) return api def export_corrections_csv(args): api = get_studio_api() - channel_tree = get_channel_tree(api, args.channel_id, suffix='-export') + channel_tree = get_channel_tree(api, args.channel_id, suffix="-export") print_channel_tree(channel_tree) csvexporter = CorretionsCsvFileExporter() csvexporter.export_channel_tree_as_corrections_csv(channel_tree) @@ -590,69 +631,88 @@ def export_corrections_csv(args): def apply_corrections(args): # 1. LOAD Studio channel_tree (needed for lookups by node_id, content_id, etc.) api = get_studio_api() - channel_tree = get_channel_tree(api, args.channel_id, suffix='-before') + channel_tree = get_channel_tree(api, args.channel_id, suffix="-before") # # 2. 
IMPORT the corrections from the Spreadsheet - csvfilepath = 'corrections-import.csv' + csvfilepath = "corrections-import.csv" save_gsheet_to_local_csv(args.gsheet_id, args.gid, csvfilepath=csvfilepath) # # 3. TRANSFORM corrections-import.csv to Studio detailed diff format - modifyattrs = args.modifyattrs.split(',') # using only selected attributes + modifyattrs = args.modifyattrs.split(",") # using only selected attributes correctionspath = get_corrections_by_node_id(csvfilepath, modifyattrs) # # Special case: when export was performed on source channel, but we want to # apply the corrections to a cloned channel. In that cases, the `Node ID` # column in the CSV corresponds to the `original_source_node_id` attribute # of the nodes in the derivative channel so we must do a remapping: - if args.primarykey == 'original_source_node_id': + if args.primarykey == "original_source_node_id": corrections_by_original_source_node_id = json.load(open(correctionspath)) - corrections_by_node_id = remap_original_source_node_id_to_node_id(channel_tree, corrections_by_original_source_node_id) - json.dump(corrections_by_node_id, open(correctionspath, 'w'), indent=4, ensure_ascii=False, sort_keys=True) - print('Finished original_source_node_id-->node_id lookup and remapping.') - elif args.primarykey in ['content_id', 'studio_id']: - raise NotImplementedError('Using content_id and studio_id not ready yet.') + corrections_by_node_id = remap_original_source_node_id_to_node_id( + channel_tree, corrections_by_original_source_node_id + ) + json.dump( + corrections_by_node_id, + open(correctionspath, "w"), + indent=4, + ensure_ascii=False, + sort_keys=True, + ) + print("Finished original_source_node_id-->node_id lookup and remapping.") + elif args.primarykey in ["content_id", "studio_id"]: + raise NotImplementedError("Using content_id and studio_id not ready yet.") # # Early exit if running the `importonly` command - if args.command == 'importonly': - print('Corrections json file imported. See', correctionspath) + if args.command == "importonly": + print("Corrections json file imported. See", correctionspath) return correctionspath # # 4. LOAD corrections.json (four lists of corrections organized by nod_id) corrections_by_node_id = json.load(open(correctionspath)) # # 5. Apply the corrections - apply_corrections_by_node_id(api, channel_tree, args.channel_id, corrections_by_node_id) + apply_corrections_by_node_id( + api, channel_tree, args.channel_id, corrections_by_node_id + ) # # 6. 
SAVE the Studio tree after corrections for review of what was changed - channel_tree = get_channel_tree(api, args.channel_id, suffix='-after') - + channel_tree = get_channel_tree(api, args.channel_id, suffix="-after") def correctionsmain(): """ Command line interface for applying bulk-edit corrections: """ - parser = argparse.ArgumentParser(description='Bulk channel edits via CSV/sheets.') - parser.add_argument('command', help='One of export|importonly|apply', - choices=['export', 'importonly', 'apply']) - parser.add_argument('channel_id', help='The studio Channel ID to edit') - parser.add_argument('--primarykey', help='Which idendifier to use when looking up nodes', - choices=['node_id', 'content_id', 'original_source_node_id', 'studio_id'], - default='node_id') - parser.add_argument('--gsheet_id', help='Google spreadsheets sheet ID (public)') - parser.add_argument('--gid', help='The gid argument to indicate which sheet', default='0') - parser.add_argument('--modifyattrs', help='Which attributes to modify', - default='title,description,author,copyright_holder') + parser = argparse.ArgumentParser(description="Bulk channel edits via CSV/sheets.") + parser.add_argument( + "command", + help="One of export|importonly|apply", + choices=["export", "importonly", "apply"], + ) + parser.add_argument("channel_id", help="The studio Channel ID to edit") + parser.add_argument( + "--primarykey", + help="Which idendifier to use when looking up nodes", + choices=["node_id", "content_id", "original_source_node_id", "studio_id"], + default="node_id", + ) + parser.add_argument("--gsheet_id", help="Google spreadsheets sheet ID (public)") + parser.add_argument( + "--gid", help="The gid argument to indicate which sheet", default="0" + ) + parser.add_argument( + "--modifyattrs", + help="Which attributes to modify", + default="title,description,author,copyright_holder", + ) args = parser.parse_args() # print("in corrections.main with cliargs", args) - if args.command == 'export': + if args.command == "export": export_corrections_csv(args) - elif args.command in ['importonly', 'apply']: + elif args.command in ["importonly", "apply"]: apply_corrections(args) else: - raise ValueError('Unrecognized command') + raise ValueError("Unrecognized command") -if __name__ == '__main__': +if __name__ == "__main__": correctionsmain() - diff --git a/ricecooker/utils/downloader.py b/ricecooker/utils/downloader.py index faa7e5f1..668681eb 100644 --- a/ricecooker/utils/downloader.py +++ b/ricecooker/utils/downloader.py @@ -4,42 +4,49 @@ import mimetypes import os import re -import requests +import selenium.webdriver.support.ui as selenium_ui import shutil import tempfile import time -from urllib.parse import urlparse, urljoin -from urllib.request import url2pathname import uuid +from selenium import webdriver +from urllib.parse import urljoin +from urllib.parse import urlparse +from urllib.request import url2pathname import chardet - +import requests from bs4 import BeautifulSoup -from selenium import webdriver -import selenium.webdriver.support.ui as selenium_ui from requests_file import FileAdapter -from ricecooker.config import LOGGER, PHANTOMJS_PATH, STRICT -from ricecooker.utils.html import download_file, replace_links -from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter + +from ricecooker.config import LOGGER +from ricecooker.config import PHANTOMJS_PATH +from ricecooker.config import STRICT +from ricecooker.utils.caching import CacheControlAdapter +from 
ricecooker.utils.caching import CacheForeverHeuristic +from ricecooker.utils.caching import FileCache +from ricecooker.utils.caching import InvalidatingCacheControlAdapter +from ricecooker.utils.html import download_file +from ricecooker.utils.html import replace_links from ricecooker.utils.zip import create_predictable_zip -DOWNLOAD_SESSION = requests.Session() # Session for downloading content from urls -DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3)) -DOWNLOAD_SESSION.mount('file://', FileAdapter()) +DOWNLOAD_SESSION = requests.Session() # Session for downloading content from urls +DOWNLOAD_SESSION.mount("https://", requests.adapters.HTTPAdapter(max_retries=3)) +DOWNLOAD_SESSION.mount("file://", FileAdapter()) # use_dir_lock works with all filesystems and OSes -cache = FileCache('.webcache', use_dir_lock=True) -forever_adapter= CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) +cache = FileCache(".webcache", use_dir_lock=True) +forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) # we can't use requests caching for pyppeteer / phantomjs, so track those separately. downloaded_pages = {} -DOWNLOAD_SESSION.mount('http://', forever_adapter) -DOWNLOAD_SESSION.mount('https://', forever_adapter) +DOWNLOAD_SESSION.mount("http://", forever_adapter) +DOWNLOAD_SESSION.mount("https://", forever_adapter) DEFAULT_HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0", "Accept-Encoding": "gzip, deflate", - "Connection": "keep-alive" + "Connection": "keep-alive", } @@ -55,20 +62,34 @@ from pyppeteer import launch, errors async def load_page(path, timeout=30, strict=True): - browser = await launch({'headless': True}) + browser = await launch({"headless": True}) content = None cookies = None page = None try: page = await browser.newPage() try: - await page.goto(path, {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle0']}) + await page.goto( + path, + { + "timeout": timeout * 1000, + "waitUntil": ["load", "domcontentloaded", "networkidle0"], + }, + ) except errors.TimeoutError: # some sites have API calls running regularly, so the timeout may be that there's never any true # network idle time. Try 'networkidle2' option instead before determining we can't scrape. if not strict: - LOGGER.info("Attempting to download URL with networkidle2 instead of networkidle0...") - await page.goto(path, {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle2']}) + LOGGER.info( + "Attempting to download URL with networkidle2 instead of networkidle0..." 
+ ) + await page.goto( + path, + { + "timeout": timeout * 1000, + "waitUntil": ["load", "domcontentloaded", "networkidle2"], + }, + ) else: raise # get the entire rendered page, including the doctype @@ -78,36 +99,53 @@ async def load_page(path, timeout=30, strict=True): LOGGER.warning("Error scraping page: {}".format(e)) finally: await browser.close() - return content, {'cookies': cookies, 'url': path} + return content, {"cookies": cookies, "url": path} async def take_screenshot(url, filename, element=None, timeout=30): - browser = await launch({'headless': True}) + browser = await launch({"headless": True}) try: page = await browser.newPage() - await page.goto(url, - {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle0']}) + await page.goto( + url, + { + "timeout": timeout * 1000, + "waitUntil": ["load", "domcontentloaded", "networkidle0"], + }, + ) screenshot_element = page if element: - await page.waitForSelector(element, {'timeout': 10000}) + await page.waitForSelector(element, {"timeout": 10000}) elements = await page.querySelectorAll(element) if len(list(elements)) > 1: - LOGGER.warning("Multiple elements matched screenshot element, using first...") + LOGGER.warning( + "Multiple elements matched screenshot element, using first..." + ) screenshot_element = elements[0] LOGGER.info("Saving screenshot to {}".format(filename)) - await screenshot_element.screenshot({'path': filename}) + await screenshot_element.screenshot({"path": filename}) finally: await page.close() await browser.close() + USE_PYPPETEER = True except: print("Unable to load pyppeteer, using phantomjs for JS loading.") pass -def read(path, loadjs=False, session=None, driver=None, timeout=60, - clear_cookies=True, loadjs_wait_time=3, loadjs_wait_for_callback=None, strict=True): +def read( + path, + loadjs=False, + session=None, + driver=None, + timeout=60, + clear_cookies=True, + loadjs_wait_time=3, + loadjs_wait_for_callback=None, + strict=True, +): """Reads from source and returns contents Args: @@ -133,7 +171,7 @@ def read(path, loadjs=False, session=None, driver=None, timeout=60, session = session or DOWNLOAD_SESSION try: - if loadjs: # Wait until js loads then return contents + if loadjs: # Wait until js loads then return contents if USE_PYPPETEER: content = asyncio.get_event_loop().run_until_complete(load_page(path)) return content @@ -148,17 +186,21 @@ def read(path, loadjs=False, session=None, driver=None, timeout=60, time.sleep(loadjs_wait_time) return driver.page_source - else: # Read page contents from url + else: # Read page contents from url response = make_request(path, clear_cookies, session=session) return response.content except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema): - with open(path, 'rb') as fobj: # If path is a local file path, try to open the file + with open( + path, "rb" + ) as fobj: # If path is a local file path, try to open the file return fobj.read() -def make_request(url, clear_cookies=False, headers=None, timeout=60, session=None, *args, **kwargs): +def make_request( + url, clear_cookies=False, headers=None, timeout=60, session=None, *args, **kwargs +): sess = session or DOWNLOAD_SESSION if clear_cookies: @@ -173,16 +215,33 @@ def make_request(url, clear_cookies=False, headers=None, timeout=60, session=Non while retry_count <= max_retries: try: - response = sess.get(url, headers=request_headers, stream=True, timeout=timeout, *args, **kwargs) + response = sess.get( + url, + headers=request_headers, + stream=True, + timeout=timeout, 
+ *args, + **kwargs + ) if response.status_code != 200: - LOGGER.error("{} error while trying to download {}".format(response.status_code, url)) + LOGGER.error( + "{} error while trying to download {}".format( + response.status_code, url + ) + ) if STRICT: response.raise_for_status() return response - except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e: + except ( + requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout, + ) as e: retry_count += 1 - LOGGER.warning("Error with connection ('{msg}'); about to perform retry {count} of {trymax}." - .format(msg=str(e), count=retry_count, trymax=max_retries)) + LOGGER.warning( + "Error with connection ('{msg}'); about to perform retry {count} of {trymax}.".format( + msg=str(e), count=retry_count, trymax=max_retries + ) + ) time.sleep(retry_count * 1) if retry_count > max_retries: LOGGER.error("Could not connect to: {}".format(url)) @@ -196,16 +255,26 @@ def make_request(url, clear_cookies=False, headers=None, timeout=60, session=Non # TODO(davidhu): Use MD5 hash of URL (ideally file) instead. def _derive_filename(url): - name = os.path.basename(urlparse(url).path).replace('%', '_') + name = os.path.basename(urlparse(url).path).replace("%", "_") return ("%s.%s" % (uuid.uuid4().hex, name)).lower() # TODO: The number of args and inner functions in this strongly suggest this needs # to be a class or have its functionality separated out. -def download_static_assets(doc, destination, base_url, - request_fn=make_request, url_blacklist=[], js_middleware=None, - css_middleware=None, derive_filename=_derive_filename, link_policy=None, - run_js=False, resource_urls=None, relative_links=False): +def download_static_assets( + doc, + destination, + base_url, + request_fn=make_request, + url_blacklist=[], + js_middleware=None, + css_middleware=None, + derive_filename=_derive_filename, + link_policy=None, + run_js=False, + resource_urls=None, + relative_links=False, +): """ Download all static assets referenced from an HTML page. The goal is to easily create HTML5 apps! Downloads JS, CSS, images, and @@ -231,8 +300,8 @@ def download_static_assets(doc, destination, base_url, extract the raw HTML.) """ # without the ending /, some functions will treat the last path component like a filename, so add it. - if not base_url.endswith('/'): - base_url += '/' + if not base_url.endswith("/"): + base_url += "/" LOGGER.debug("base_url = {}".format(base_url)) @@ -244,7 +313,7 @@ def download_srcset(selector, attr, content_middleware=None): for i, node in enumerate(nodes): srcset = node[attr] - sources = srcset.split(',') + sources = srcset.split(",") new_sources = [] for source in sources: # a source can be just a URL, or a URL + a space character and then a width or resolution. 
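# Illustrative sketch (not part of the patch): how the download_srcset helper in the
# surrounding hunks splits a srcset attribute value into URL/descriptor pairs before
# each URL is downloaded and rewritten. The helper name split_srcset and the sample
# value below are assumptions for illustration only, not ricecooker API.
from urllib.parse import urljoin


def split_srcset(srcset, base_url):
    """Return (absolute_url, descriptor) pairs parsed from a srcset attribute value."""
    pairs = []
    for source in srcset.split(","):
        # each source is a URL, optionally followed by a width or resolution descriptor
        parts = source.strip().split(" ", 1)
        url = urljoin(base_url, parts[0])
        descriptor = parts[1].strip() if len(parts) > 1 else ""
        pairs.append((url, descriptor))
    return pairs


# split_srcset("img/a.png 1x, img/b.png 2x", "https://example.org/page/")
# -> [("https://example.org/page/img/a.png", "1x"),
#     ("https://example.org/page/img/b.png", "2x")]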
@@ -254,22 +323,30 @@ def download_srcset(selector, attr, content_middleware=None): new_url = filename if relative_links and base_url: base_filename = derive_filename(base_url) - new_url = get_relative_url_for_archive_filename(filename, base_filename) + new_url = get_relative_url_for_archive_filename( + filename, base_filename + ) fullpath = os.path.join(destination, filename) if not os.path.exists(fullpath): LOGGER.info("Downloading {} to filename {}".format(url, fullpath)) - download_file(url, destination, request_fn=request_fn, - filename=filename, middleware_callbacks=content_middleware) + download_file( + url, + destination, + request_fn=request_fn, + filename=filename, + middleware_callbacks=content_middleware, + ) if len(parts) > 1: - new_sources.append(" ".join([new_url, parts[1]])) + new_sources.append(" ".join([new_url, parts[1]])) else: new_sources.append(new_url) - node[attr] = ', '.join(new_sources) + node[attr] = ", ".join(new_sources) # Helper function to download all assets for a given CSS selector. - def download_assets(selector, attr, url_middleware=None, - content_middleware=None, node_filter=None): + def download_assets( + selector, attr, url_middleware=None, content_middleware=None, node_filter=None + ): nodes = doc.select(selector) for i, node in enumerate(nodes): @@ -277,17 +354,17 @@ def download_assets(selector, attr, url_middleware=None, if node_filter: if not node_filter(node): src = node[attr] - node[attr] = '' - print(' Skipping node with src ', src) + node[attr] = "" + print(" Skipping node with src ", src) continue - if node[attr].startswith('data:'): + if node[attr].startswith("data:"): continue url = urljoin(base_url, node[attr]) if _is_blacklisted(url, url_blacklist): - LOGGER.info(' Skipping downloading blacklisted url', url) + LOGGER.info(" Skipping downloading blacklisted url", url) node[attr] = "" continue @@ -300,13 +377,13 @@ def download_assets(selector, attr, url_middleware=None, # This COULD be an index file in a dir, or just a file with no extension. Handle either case by # turning the path into filename + '/index' + the file extension from the content type response = requests.get(url) - type = response.headers['content-type'].split(';')[0] + type = response.headers["content-type"].split(";")[0] ext = mimetypes.guess_extension(type) # if we're really stuck, just default to HTML as that is most likely if this is a redirect. if not ext: - ext = '.html' + ext = ".html" subpath = os.path.dirname(filename) - filename = 'index{}'.format(ext) + filename = "index{}".format(ext) os.makedirs(os.path.join(destination, subpath), exist_ok=True) @@ -319,21 +396,25 @@ def download_assets(selector, attr, url_middleware=None, fullpath = os.path.join(destination, filename) if not os.path.exists(fullpath): LOGGER.info("Downloading {} to filename {}".format(url, fullpath)) - download_file(url, destination, request_fn=request_fn, - filename=filename, middleware_callbacks=content_middleware) + download_file( + url, + destination, + request_fn=request_fn, + filename=filename, + middleware_callbacks=content_middleware, + ) elif content_middleware: # Make sure we run middleware, as it creates a list of file dependencies that we need when # converting the content into a zip file. # TODO: We should probably separate out the download step from the middleware step, so # that middleware can be run regardless of how we get the content. 
- content = open(fullpath, 'r', encoding='utf-8').read() + content = open(fullpath, "r", encoding="utf-8").read() new_content = content_middleware(content, url) if new_content != content: # if the middleware changed the content, update it. - with open(fullpath, 'w') as f: + with open(fullpath, "w") as f: f.write(new_content) - def js_content_middleware(content, url, **kwargs): if js_middleware: content = js_middleware(content, url, **kwargs) @@ -354,35 +435,39 @@ def css_content_middleware(content, url, **kwargs): def repl(match): src = match.group(1) - if src.startswith('//localhost'): - return 'url()' + if src.startswith("//localhost"): + return "url()" # Don't download data: files - if src.startswith('data:'): + if src.startswith("data:"): return match.group(0) parts = urlparse(src) root_url = None if url: - root_url = url[:url.rfind('/') + 1] + root_url = url[: url.rfind("/") + 1] if parts.scheme and parts.netloc: src_url = src - elif parts.path.startswith('/') and url: - src_url = '{}://{}{}'.format(root_parts.scheme, root_parts.netloc, parts.path) + elif parts.path.startswith("/") and url: + src_url = "{}://{}{}".format( + root_parts.scheme, root_parts.netloc, parts.path + ) elif url and root_url: src_url = urljoin(root_url, src) else: src_url = urljoin(base_url, src) if _is_blacklisted(src_url, url_blacklist): - print(' Skipping downloading blacklisted url', src_url) - return 'url()' + print(" Skipping downloading blacklisted url", src_url) + return "url()" derived_filename = derive_filename(src_url) new_url = src - if url and parts.path.startswith('/') or relative_links: + if url and parts.path.startswith("/") or relative_links: page_filename = derive_filename(url) - new_url = get_relative_url_for_archive_filename(derived_filename, page_filename) + new_url = get_relative_url_for_archive_filename( + derived_filename, page_filename + ) elif derive_filename == _derive_filename: # The _derive_filename function puts all files in the root, so all URLs need # rewritten. When using get_archive_filename, relative URLs will still work. @@ -390,10 +475,16 @@ def repl(match): fullpath = os.path.join(destination, derived_filename) if not os.path.exists(fullpath): - download_file(src_url, destination, request_fn=request_fn, - filename=derived_filename) + download_file( + src_url, + destination, + request_fn=request_fn, + filename=derived_filename, + ) else: - LOGGER.debug("Resource already downloaded, skipping: {}".format(src_url)) + LOGGER.debug( + "Resource already downloaded, skipping: {}".format(src_url) + ) return 'url("%s")' % new_url return _CSS_URL_RE.sub(repl, content) @@ -401,45 +492,51 @@ def repl(match): # Download all linked static assets. download_assets("img[src]", "src") # Images download_srcset("img[srcset]", "srcset") # Images - download_assets("link[href]", "href", - content_middleware=css_content_middleware, - node_filter=css_node_filter) # CSS - download_assets("script[src]", "src", - content_middleware=js_content_middleware) # JS - download_assets("source[src]", "src") # Potentially audio - download_srcset("source[srcset]", "srcset") # Potentially audio + download_assets( + "link[href]", + "href", + content_middleware=css_content_middleware, + node_filter=css_node_filter, + ) # CSS + download_assets( + "script[src]", "src", content_middleware=js_content_middleware + ) # JS + download_assets("source[src]", "src") # Potentially audio + download_srcset("source[srcset]", "srcset") # Potentially audio # Link scraping can be expensive, so it's off by default. 
We decrement the levels value every time we recurse # so skip once we hit zero. - if link_policy is not None and link_policy['levels'] > 0: + if link_policy is not None and link_policy["levels"] > 0: nodes = doc.select("iframe[src]") nodes += doc.select("a[href]") # TODO: add "a[href]" handling to this and/or ways to whitelist / blacklist tags and urls for node in nodes: url = None - if node.name == 'iframe': - url = node['src'] - elif node.name == 'a': - url = node['href'] + if node.name == "iframe": + url = node["src"] + elif node.name == "a": + url = node["href"] assert url is not None - download_url = url.split('#')[0] # Ignore bookmarks in URL + download_url = url.split("#")[0] # Ignore bookmarks in URL if download_url.strip() == "": continue parts = urlparse(download_url) # if we're scraping links, always scrape relative links regardless of setting. - should_scrape = 'all' in link_policy['scope'] or (not parts.scheme and not parts.netloc) - if not parts.scheme or parts.scheme.startswith('http'): + should_scrape = "all" in link_policy["scope"] or ( + not parts.scheme and not parts.netloc + ) + if not parts.scheme or parts.scheme.startswith("http"): LOGGER.debug("checking url: {}".format(url)) if not parts.netloc: download_url = urljoin(base_url, download_url) - if 'whitelist' in link_policy: - for whitelist_item in link_policy['whitelist']: + if "whitelist" in link_policy: + for whitelist_item in link_policy["whitelist"]: if whitelist_item in download_url: should_scrape = True break - if 'blacklist' in link_policy: - for blacklist_item in link_policy['blacklist']: + if "blacklist" in link_policy: + for blacklist_item in link_policy["blacklist"]: if blacklist_item in download_url: should_scrape = False break @@ -447,22 +544,41 @@ def repl(match): if should_scrape: policy = copy.copy(link_policy) # make sure we reduce the depth level by one each time we recurse - policy['levels'] -= 1 + policy["levels"] -= 1 # no extension is most likely going to return HTML as well. 
- is_html = os.path.splitext(download_url)[1] in ['.htm', '.html', '.xhtml', ''] + is_html = os.path.splitext(download_url)[1] in [ + ".htm", + ".html", + ".xhtml", + "", + ] derived_filename = derive_filename(download_url) new_url = derived_filename if is_html: if not download_url in downloaded_pages: - LOGGER.info("Downloading linked HTML page {}".format(download_url)) + LOGGER.info( + "Downloading linked HTML page {}".format(download_url) + ) global archiver if archiver: - info = archiver.get_page(download_url, link_policy=policy, run_js=run_js) - filename = info['index_path'].replace(archiver.root_dir + os.sep, '') + info = archiver.get_page( + download_url, link_policy=policy, run_js=run_js + ) + filename = info["index_path"].replace( + archiver.root_dir + os.sep, "" + ) else: - info = archive_page(download_url, destination, link_policy=policy, run_js=run_js, relative_links=relative_links) - filename = info['index_path'].replace(destination + os.sep, '') + info = archive_page( + download_url, + destination, + link_policy=policy, + run_js=run_js, + relative_links=relative_links, + ) + filename = info["index_path"].replace( + destination + os.sep, "" + ) new_url = filename downloaded_pages[download_url] = new_url @@ -475,7 +591,9 @@ def repl(match): if relative_links and base_url: page_filename = derive_filename(base_url) - new_url = get_relative_url_for_archive_filename(new_url, page_filename) + new_url = get_relative_url_for_archive_filename( + new_url, page_filename + ) else: full_path = os.path.join(destination, derived_filename) new_url = derived_filename @@ -485,19 +603,19 @@ def repl(match): else: LOGGER.info("File already downloaded, skipping: {}".format(url)) - if node.name == 'iframe': - node['src'] = new_url - elif node.name == 'a': - node['href'] = new_url + if node.name == "iframe": + node["src"] = new_url + elif node.name == "a": + node["href"] = new_url # ... and also run the middleware on CSS/JS embedded in the page source to # get linked files. - for node in doc.select('style'): - node.string = css_content_middleware(node.get_text(), url='') + for node in doc.select("style"): + node.string = css_content_middleware(node.get_text(), url="") - for node in doc.select('script'): - if not node.attrs.get('src'): - node.string = js_content_middleware(node.get_text(), url='') + for node in doc.select("script"): + if not node.attrs.get("src"): + node.string = js_content_middleware(node.get_text(), url="") return doc @@ -526,14 +644,18 @@ def get_archive_filename(url, page_url=None, download_root=None, resource_urls=N if file_url_parsed.query: # Append the query to the filename, so that the filename is unique for each set of params. 
- query_string = "_{}".format(file_url_parsed.query.replace('=', '_').replace('&', '_')) + query_string = "_{}".format( + file_url_parsed.query.replace("=", "_").replace("&", "_") + ) local_path = _path + query_string + ext LOGGER.debug("local_path is now {}".format(local_path)) local_dir_name = local_path - if ext != '': + if ext != "": local_dir_name = os.path.dirname(local_path) - LOGGER.debug("local_path = {}, local_dir_name = {}".format(local_path, local_dir_name)) + LOGGER.debug( + "local_path = {}, local_dir_name = {}".format(local_path, local_dir_name) + ) if local_dir_name != local_path and resource_urls is not None: full_dir = os.path.join(download_root, local_dir_name) @@ -543,19 +665,26 @@ def get_archive_filename(url, page_url=None, download_root=None, resource_urls=N # Right now, this code depends on any file links having an extension, as in this function # we don't know the mimetype of the resource yet. We should probably pass in mimetype to this # function so we can construct filenames for extensionless URLs. - if os.path.splitext(local_path)[1].strip() != '': + if os.path.splitext(local_path)[1].strip() != "": LOGGER.debug("replacing {} with {}".format(url, local_path)) resource_urls[url] = local_path return local_path def get_relative_url_for_archive_filename(filename, relative_to): - if os.path.isfile(relative_to) or os.path.splitext(relative_to)[1] != '': + if os.path.isfile(relative_to) or os.path.splitext(relative_to)[1] != "": relative_to = os.path.dirname(relative_to) return os.path.relpath(filename, relative_to).replace("\\", "/") -def archive_page(url, download_root, link_policy=None, run_js=False, strict=False, relative_links=False): +def archive_page( + url, + download_root, + link_policy=None, + run_js=False, + strict=False, + relative_links=False, +): """ Download fully rendered page and all related assets into ricecooker's site archive format. @@ -571,27 +700,35 @@ def archive_page(url, download_root, link_policy=None, run_js=False, strict=Fals os.makedirs(download_root, exist_ok=True) if run_js: - content, props = asyncio.get_event_loop().run_until_complete(load_page(url, strict=strict)) + content, props = asyncio.get_event_loop().run_until_complete( + load_page(url, strict=strict) + ) else: response = make_request(url) - props = {'cookies': requests.utils.dict_from_cookiejar(response.cookies), 'url': response.url} - if not 'charset' in response.headers['Content-Type']: + props = { + "cookies": requests.utils.dict_from_cookiejar(response.cookies), + "url": response.url, + } + if not "charset" in response.headers["Content-Type"]: # It seems requests defaults to ISO-8859-1 when the headers don't explicitly declare an # encoding. In this case, we're better off using chardet to guess instead. encoding = chardet.detect(response.content) - if encoding and 'encoding' in encoding: - response.encoding = encoding['encoding'] + if encoding and "encoding" in encoding: + response.encoding = encoding["encoding"] LOGGER.warning("Encoding = {}".format(response.encoding)) content = response.text # url may be redirected, for relative link handling we want the final URL that was loaded. 
- url = props['url'] + url = props["url"] # get related assets parts = urlparse(url) if not parts.scheme: - parts.scheme = 'https' - base_url = urljoin("{}://{}".format(parts.scheme, parts.netloc), parts.path[:parts.path.rfind('/')]) + parts.scheme = "https" + base_url = urljoin( + "{}://{}".format(parts.scheme, parts.netloc), + parts.path[: parts.path.rfind("/")], + ) resource_urls = {} if content: @@ -600,40 +737,55 @@ def archive_page(url, download_root, link_policy=None, run_js=False, strict=Fals def get_resource_filename(url): return get_archive_filename(url, page_url, download_root, resource_urls) - doc = download_static_assets(content, download_root, base_url, derive_filename=get_resource_filename, - link_policy=link_policy, run_js=run_js, resource_urls=resource_urls, - relative_links=relative_links) - download_path = os.path.join(download_root, get_archive_filename(url, page_url, download_root)) + doc = download_static_assets( + content, + download_root, + base_url, + derive_filename=get_resource_filename, + link_policy=link_policy, + run_js=run_js, + resource_urls=resource_urls, + relative_links=relative_links, + ) + + download_path = os.path.join( + download_root, get_archive_filename(url, page_url, download_root) + ) _path, ext = os.path.splitext(download_path) index_path = download_path - if '.htm' not in ext: - if page_url.endswith('/'): - index_path = download_path + 'index.html' + if ".htm" not in ext: + if page_url.endswith("/"): + index_path = download_path + "index.html" else: - index_path = download_path + '.html' + index_path = download_path + ".html" index_dir = os.path.dirname(index_path) new_content = doc.prettify() # Replace any links with relative links that we haven't changed already. # TODO: Find a way to determine when this check is no longer needed. 
- new_content = replace_links(new_content, resource_urls, download_root, index_dir, relative_links=relative_links) + new_content = replace_links( + new_content, + resource_urls, + download_root, + index_dir, + relative_links=relative_links, + ) os.makedirs(index_dir, exist_ok=True) - soup = BeautifulSoup(new_content, features='lxml') - f = open(index_path, 'wb') + soup = BeautifulSoup(new_content, features="lxml") + f = open(index_path, "wb") f.write(soup.prettify(encoding="utf-8")) f.close() - page_info = { - 'url': url, - 'cookies': props['cookies'], - 'index_path': index_path, - 'resources': list(resource_urls.values()), - 'resource_urls': resource_urls + "url": url, + "cookies": props["cookies"], + "index_path": index_path, + "resources": list(resource_urls.values()), + "resource_urls": resource_urls, } LOGGER.info("archive_page finished...") return page_info @@ -683,7 +835,7 @@ def download_in_parallel(urls, func=None, max_workers=5): class ArchiveDownloader: def __init__(self, root_dir, relative_links=True): self.root_dir = root_dir - self.cache_file = os.path.join(self.root_dir, 'archive_files.json') + self.cache_file = os.path.join(self.root_dir, "archive_files.json") self.cache_data = {} # This is temporarily configurable for ArchiveDownloader-based chefs that @@ -702,51 +854,64 @@ def __del__(self): archiver = None def save_cache_data(self): - with open(self.cache_file, 'w') as f: + with open(self.cache_file, "w") as f: f.write(json.dumps(self.cache_data, ensure_ascii=False, indent=2)) def clear_cache_data(self): self.cache_data = {} self.save_cache_data() - def get_page(self, url, refresh=False, link_policy=None, run_js=False, strict=False): + def get_page( + self, url, refresh=False, link_policy=None, run_js=False, strict=False + ): if refresh or not url in self.cache_data: - self.cache_data[url] = archive_page(url, download_root=self.root_dir, link_policy=link_policy, run_js=run_js, strict=strict, relative_links=self.relative_links) + self.cache_data[url] = archive_page( + url, + download_root=self.root_dir, + link_policy=link_policy, + run_js=run_js, + strict=strict, + relative_links=self.relative_links, + ) self.save_cache_data() return self.cache_data[url] def get_relative_index_path(self, url): - if url in self.cache_data and 'index_path' in self.cache_data[url]: + if url in self.cache_data and "index_path" in self.cache_data[url]: if not self.relative_links: # we copy the main page to index.html in the root of the page archive. return "index.html" - return self.cache_data[url]['index_path'].replace(self.root_dir + os.sep, '') + return self.cache_data[url]["index_path"].replace( + self.root_dir + os.sep, "" + ) return None def find_page_by_index_path(self, index_path): for url in self.cache_data: - if self.cache_data[url]['index_path'] == index_path: + if self.cache_data[url]["index_path"] == index_path: return self.cache_data[url] return None def get_page_soup(self, url): if not url in self.cache_data: - raise KeyError("Unable to find page {} in archive. Did you call get_page?".format(url)) + raise KeyError( + "Unable to find page {} in archive. Did you call get_page?".format(url) + ) info = self.cache_data[url] # lxml enables some nice features like being able to search for individual # class names using BeautifulSoup, so let's just require it. 
- soup = BeautifulSoup(open(info['index_path'], 'rb'), features='lxml') + soup = BeautifulSoup(open(info["index_path"], "rb"), features="lxml") return soup def create_dependency_zip(self, count_threshold=2): resource_counts = {} for url in self.cache_data: info = self.cache_data[url] - resources = info['resources'] + resources = info["resources"] for resource in resources.values(): if not resource in resource_counts: resource_counts[resource] = 0 @@ -767,8 +932,8 @@ def _copy_resources_to_dir(self, base_dir, resources): for res in resources: res_path = res if res_path.startswith(self.root_dir): - res_path = res_path.replace(self.root_dir, '') - if res_path.startswith('/'): + res_path = res_path.replace(self.root_dir, "") + if res_path.startswith("/"): res_path = res_path[1:] full_path = os.path.join(self.root_dir, res_path) dest_path = os.path.join(base_dir, res_path) @@ -778,21 +943,23 @@ def _copy_resources_to_dir(self, base_dir, resources): def create_zip_dir_for_page(self, url): if not url in self.cache_data: - raise KeyError("Please ensure you call get_page before calling this function to download the content.") + raise KeyError( + "Please ensure you call get_page before calling this function to download the content." + ) temp_dir = tempfile.mkdtemp() info = self.cache_data[url] # TODO: Add dependency zip handling that replaces links with the dependency zip location - self._copy_resources_to_dir(temp_dir, info['resources']) - for res_url in info['resource_urls']: + self._copy_resources_to_dir(temp_dir, info["resources"]) + for res_url in info["resource_urls"]: if res_url in self.cache_data: - resources = self.cache_data[res_url]['resources'] + resources = self.cache_data[res_url]["resources"] self._copy_resources_to_dir(temp_dir, resources) index_path = self.get_relative_index_path(url) os.makedirs(os.path.dirname(os.path.join(temp_dir, index_path)), exist_ok=True) - shutil.copy(info['index_path'], os.path.join(temp_dir, index_path)) + shutil.copy(info["index_path"], os.path.join(temp_dir, index_path)) return temp_dir def export_page_as_zip(self, url): diff --git a/ricecooker/utils/encodings.py b/ricecooker/utils/encodings.py index cefcae37..b323a329 100644 --- a/ricecooker/utils/encodings.py +++ b/ricecooker/utils/encodings.py @@ -1,24 +1,25 @@ -import re import base64 +import re -BASE64_REGEX_STR = r'data:image\/([A-Za-z]*);base64,((?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)*)' +BASE64_REGEX_STR = r"data:image\/([A-Za-z]*);base64,((?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)*)" BASE64_REGEX = re.compile(BASE64_REGEX_STR, flags=re.IGNORECASE) def get_base64_encoding(text): - """ get_base64_encoding: Get the first base64 match or None - Args: - text (str): text to check for base64 encoding - Returns: First match in text + """get_base64_encoding: Get the first base64 match or None + Args: + text (str): text to check for base64 encoding + Returns: First match in text """ return BASE64_REGEX.search(text) + def write_base64_to_file(encoding, fpath_out): - """ write_base64_to_file: Convert base64 image to file - Args: - encoding (str): base64 encoded string - fpath_out (str): path to file to write - Returns: None + """write_base64_to_file: Convert base64 image to file + Args: + encoding (str): base64 encoded string + fpath_out (str): path to file to write + Returns: None """ encoding_match = get_base64_encoding(encoding) @@ -26,14 +27,15 @@ def write_base64_to_file(encoding, fpath_out): assert encoding_match, "Error writing to file: Invalid base64 
encoding" with open(fpath_out, "wb") as target_file: - target_file.write(base64.decodebytes(encoding_match.group(2).encode('utf-8'))) + target_file.write(base64.decodebytes(encoding_match.group(2).encode("utf-8"))) + def encode_file_to_base64(fpath_in, prefix): - """ encode_file_to_base64: gets base64 encoding of file - Args: - fpath_in (str): path to file to encode - prefix (str): file data for encoding (e.g. 'data:image/png;base64,') - Returns: base64 encoding of file + """encode_file_to_base64: gets base64 encoding of file + Args: + fpath_in (str): path to file to encode + prefix (str): file data for encoding (e.g. 'data:image/png;base64,') + Returns: base64 encoding of file """ - with open(fpath_in, 'rb') as file_obj: - return prefix + base64.b64encode(file_obj.read()).decode('utf-8') + with open(fpath_in, "rb") as file_obj: + return prefix + base64.b64encode(file_obj.read()).decode("utf-8") diff --git a/ricecooker/utils/html.py b/ricecooker/utils/html.py index c2920855..f2325de2 100644 --- a/ricecooker/utils/html.py +++ b/ricecooker/utils/html.py @@ -1,51 +1,56 @@ import logging import os import re -import requests import signal import time import urllib - -import chardet - -from bs4 import BeautifulSoup from selenium import webdriver -from urllib.parse import urlparse, unquote +from urllib.parse import unquote +from urllib.parse import urlparse from urllib.request import pathname2url -from .caching import FileCache, CacheControlAdapter -from ricecooker.config import LOGGER, PHANTOMJS_PATH, STRICT +import chardet +import requests +from bs4 import BeautifulSoup +from .caching import CacheControlAdapter +from .caching import FileCache +from ricecooker.config import LOGGER +from ricecooker.config import PHANTOMJS_PATH +from ricecooker.config import STRICT # create a default session with basic caching mechanisms (similar to what a browser would do) sess = requests.Session() -cache = FileCache('.webcache', use_dir_lock=True) +cache = FileCache(".webcache", use_dir_lock=True) basic_adapter = CacheControlAdapter(cache=cache) -sess.mount('http://', basic_adapter) -sess.mount('https://', basic_adapter) +sess.mount("http://", basic_adapter) +sess.mount("https://", basic_adapter) if PHANTOMJS_PATH is None: - PHANTOMJS_PATH = os.path.join(os.getcwd(), "node_modules", "phantomjs-prebuilt", "bin", "phantomjs") + PHANTOMJS_PATH = os.path.join( + os.getcwd(), "node_modules", "phantomjs-prebuilt", "bin", "phantomjs" + ) class WebDriver(object): - def __init__(self, url, delay=1000): self.url = url self.delay = delay def __enter__(self): if not os.path.isfile(PHANTOMJS_PATH): - raise Exception("You must install phantomjs-prebuilt in the directory" - " you're running in with `npm install phantomjs-prebuilt`" - " or set the environment variable `PHANTOMJS_PATH`") + raise Exception( + "You must install phantomjs-prebuilt in the directory" + " you're running in with `npm install phantomjs-prebuilt`" + " or set the environment variable `PHANTOMJS_PATH`" + ) self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH) self.driver.get(self.url) time.sleep(self.delay / 1000.0) return self.driver - def __exit__(self ,type, value, traceback): + def __exit__(self, type, value, traceback): # driver.quit() by itself doesn't suffice to fully terminate spawned # PhantomJS processes: # see https://github.com/seleniumhq/selenium/issues/767 @@ -54,10 +59,16 @@ def __exit__(self ,type, value, traceback): def get_generated_html_from_driver(driver, tagname="html"): - driver.execute_script("return 
document.getElementsByTagName('{tagname}')[0].innerHTML".format(tagname=tagname)) + driver.execute_script( + "return document.getElementsByTagName('{tagname}')[0].innerHTML".format( + tagname=tagname + ) + ) -def replace_links(content, urls_to_replace, download_root=None, content_dir=None, relative_links=False): +def replace_links( + content, urls_to_replace, download_root=None, content_dir=None, relative_links=False +): for key in urls_to_replace: value = urls_to_replace[key] if key == value: @@ -76,16 +87,18 @@ def replace_links(content, urls_to_replace, download_root=None, content_dir=None rel_path = pathname2url(rel_path) if relative_links: - value = pathname2url(os.path.relpath(os.path.join(download_root, value), content_dir)) + value = pathname2url( + os.path.relpath(os.path.join(download_root, value), content_dir) + ) # When we get an absolute URL, it may appear in one of three different ways in the page: key_variants = [ # 1. /path/to/file.html - key.replace(url_parts.scheme + '://' + url_parts.netloc, ''), + key.replace(url_parts.scheme + "://" + url_parts.netloc, ""), # 2. https://www.domain.com/path/to/file.html key, # 3. //www.domain.com/path/to/file.html - key.replace(url_parts.scheme + ':', ''), + key.replace(url_parts.scheme + ":", ""), ] if rel_path and content_dir: @@ -107,7 +120,9 @@ def replace_links(content, urls_to_replace, download_root=None, content_dir=None # we avoid using BeautifulSoup because Python HTML parsers can be destructive and # do things like strip out the doctype. content = content.replace('="{}"'.format(variant), '="{}"'.format(value)) - content = content.replace('url({})'.format(variant), 'url({})'.format(value)) + content = content.replace( + "url({})".format(variant), "url({})".format(value) + ) for match in srcset_links: url = match[1] @@ -145,9 +160,13 @@ def calculate_relative_url(url, filename=None, baseurl=None, subpath=None): # if a base path was supplied, calculate the file's subpath relative to it if baseurl: - baseurl = urllib.parse.urljoin(baseurl, ".") # ensure baseurl is normalized (to remove '/./' and '/../') - assert url.startswith(baseurl), "URL {} must start with baseurl {}".format(url, baseurl) - subpath = subpath + url[len(baseurl):].strip("/").split("/")[:-1] + baseurl = urllib.parse.urljoin( + baseurl, "." + ) # ensure baseurl is normalized (to remove '/./' and '/../') + assert url.startswith(baseurl), "URL {} must start with baseurl {}".format( + url, baseurl + ) + subpath = subpath + url[len(baseurl) :].strip("/").split("/")[:-1] # if we don't have a filename, extract it from the URL if not filename: @@ -159,7 +178,16 @@ def calculate_relative_url(url, filename=None, baseurl=None, subpath=None): return relative_file_url, subpath, filename -def download_file(url, destpath, filename=None, baseurl=None, subpath=None, middleware_callbacks=None, middleware_kwargs=None, request_fn=sess.get): +def download_file( + url, + destpath, + filename=None, + baseurl=None, + subpath=None, + middleware_callbacks=None, + middleware_kwargs=None, + request_fn=sess.get, +): """ Download a file from a URL, into a destination folder, with optional use of relative paths and middleware processors. @@ -170,7 +198,9 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd - If `middleware_kwargs` are also specified, they will also be passed in to each function in middleware_callbacks. 
""" - relative_file_url, subpath, filename = calculate_relative_url(url, filename=filename, baseurl=baseurl, subpath=subpath) + relative_file_url, subpath, filename = calculate_relative_url( + url, filename=filename, baseurl=baseurl, subpath=subpath + ) LOGGER.info("Download called for {}".format(url)) # ensure that the destination directory exists @@ -188,18 +218,20 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd # if there are any middleware callbacks, apply them to the content if middleware_callbacks: - if 'content-type' in response.headers: - type = response.headers['content-type'].split(';')[0] + if "content-type" in response.headers: + type = response.headers["content-type"].split(";")[0] # Rely on requests to convert bytes to unicode for us when it's a text file # otherwise, we just use bytes - if type.startswith('text'): + if type.startswith("text"): # It seems requests defaults to ISO-8859-1 when the headers don't explicitly declare an # encoding. In this case, we're better off using chardet to guess instead. if not response.encoding: encoding = chardet.detect(response.content) - if encoding and 'encoding' in encoding: - response.encoding = encoding['encoding'] - LOGGER.warning("encoding for {} = {}".format(url, response.encoding)) + if encoding and "encoding" in encoding: + response.encoding = encoding["encoding"] + LOGGER.warning( + "encoding for {} = {}".format(url, response.encoding) + ) content = response.text if not isinstance(middleware_callbacks, list): @@ -219,7 +251,7 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd # ensure content is encoded, as we're doing a binary write if isinstance(content, str): - content = content.encode('utf-8') + content = content.encode("utf-8") # calculate the final destination for the file, and write the content out to there dest = os.path.join(fulldestpath, filename) diff --git a/ricecooker/utils/html_writer.py b/ricecooker/utils/html_writer.py index 8b12d8e1..7792d823 100644 --- a/ricecooker/utils/html_writer.py +++ b/ricecooker/utils/html_writer.py @@ -1,20 +1,22 @@ import os import zipfile + from ricecooker.utils.downloader import read -class HTMLWriter(): + +class HTMLWriter: """ - Class for writing zipfiles + Class for writing zipfiles """ - zf = None # Zip file to write to - write_to_path = None # Where to write zip file + zf = None # Zip file to write to + write_to_path = None # Where to write zip file def __init__(self, write_to_path, mode="w"): """ Args: write_to_path: (str) where to write zip file """ - self.map = {} # Keeps track of content to write to csv + self.map = {} # Keeps track of content to write to csv self.write_to_path = write_to_path # Where to write zip file - self.mode = mode # What mode to open zipfile in + self.mode = mode # What mode to open zipfile in def __enter__(self): """ Called when opening context (e.g. 
with HTMLWriter() as writer: ) """ @@ -41,49 +43,55 @@ def _copy_to_zipfile(self, filepath, arcname=None): """ USER-FACING METHODS """ def open(self): - """ open: Opens zipfile to write to - Args: None - Returns: None + """open: Opens zipfile to write to + Args: None + Returns: None """ self.zf = zipfile.ZipFile(self.write_to_path, self.mode) def close(self): - """ close: Close zipfile when done - Args: None - Returns: None + """close: Close zipfile when done + Args: None + Returns: None """ - index_present = self.contains('index.html') - self.zf.close() # Make sure zipfile closes no matter what + index_present = self.contains("index.html") + self.zf.close() # Make sure zipfile closes no matter what if not index_present: - raise ReferenceError("Invalid Zip at {}: missing index.html file (use write_index_contents method)".format(self.write_to_path)) + raise ReferenceError( + "Invalid Zip at {}: missing index.html file (use write_index_contents method)".format( + self.write_to_path + ) + ) def contains(self, filename): - """ contains: Checks if filename is in the zipfile - Args: filename: (str) name of file to check - Returns: boolean indicating whether or not filename is in the zip + """contains: Checks if filename is in the zipfile + Args: filename: (str) name of file to check + Returns: boolean indicating whether or not filename is in the zip """ return filename in self.zf.namelist() def write_contents(self, filename, contents, directory=None): - """ write_contents: Write contents to filename in zip - Args: - contents: (str) contents of file - filename: (str) name of file in zip - directory: (str) directory in zipfile to write file to (optional) - Returns: path to file in zip + """write_contents: Write contents to filename in zip + Args: + contents: (str) contents of file + filename: (str) name of file in zip + directory: (str) directory in zipfile to write file to (optional) + Returns: path to file in zip """ - filepath = "{}/{}".format(directory.rstrip("/"), filename) if directory else filename + filepath = ( + "{}/{}".format(directory.rstrip("/"), filename) if directory else filename + ) self._write_to_zipfile(filepath, contents) return filepath def write_file(self, filepath, filename=None, directory=None): - """ write_file: Write local file to zip - Args: - filepath: (str) location to local file - directory: (str) directory in zipfile to write file to (optional) - Returns: path to file in zip + """write_file: Write local file to zip + Args: + filepath: (str) location to local file + directory: (str) directory in zipfile to write file to (optional) + Returns: path to file in zip - Note: filepath must be a relative path + Note: filepath must be a relative path """ arcname = None if filename or directory: @@ -94,22 +102,24 @@ def write_file(self, filepath, filename=None, directory=None): return arcname or filepath def write_url(self, url, filename, directory=None): - """ write_url: Write contents from url to filename in zip - Args: - url: (str) url to file to download - filename: (str) name of file in zip - directory: (str) directory in zipfile to write file to (optional) - Returns: path to file in zip + """write_url: Write contents from url to filename in zip + Args: + url: (str) url to file to download + filename: (str) name of file in zip + directory: (str) directory in zipfile to write file to (optional) + Returns: path to file in zip """ - filepath = "{}/{}".format(directory.rstrip("/"), filename) if directory else filename + filepath = ( + "{}/{}".format(directory.rstrip("/"), 
filename) if directory else filename + ) if not self.contains(filepath): self._write_to_zipfile(filepath, read(url)) return filepath def write_index_contents(self, contents): - """ write_index_contents: Write main index file to zip - Args: - contents: (str) contents of file - Returns: path to file in zip + """write_index_contents: Write main index file to zip + Args: + contents: (str) contents of file + Returns: path to file in zip """ - self._write_to_zipfile('index.html', contents) + self._write_to_zipfile("index.html", contents) diff --git a/ricecooker/utils/images.py b/ricecooker/utils/images.py index 3fccd630..3ac77425 100644 --- a/ricecooker/utils/images.py +++ b/ricecooker/utils/images.py @@ -1,22 +1,20 @@ import os import zipfile -import ebooklib -import ebooklib.epub from io import BytesIO - +import ebooklib.epub from pdf2image import convert_from_path from PIL import Image from .thumbscropping import scale_and_crop - # SMARTCROP UTILS ################################################################################ THUMBNAIL_SIZE = (400, 225) # 16:9 aspect ratio + def scale_and_crop_thumbnail(image, size=THUMBNAIL_SIZE, crop="smart", **kwargs): """ Scale and crop the PIL Image ``image`` to maximum dimensions of ``size``. @@ -32,10 +30,10 @@ def scale_and_crop_thumbnail(image, size=THUMBNAIL_SIZE, crop="smart", **kwargs) return scale_and_crop(image, size, crop=crop, upscale=True, **kwargs) - # THUMBNAILS FOR CONTENT KINDS ################################################################################ + def create_image_from_epub(epubfile, fpath_out, crop=None): """ Generate a thumbnail image from `epubfile` and save it to `fpath_out`. @@ -45,10 +43,10 @@ def create_image_from_epub(epubfile, fpath_out, crop=None): book = ebooklib.epub.read_epub(epubfile) # 1. try to get cover image from book metadata (content.opf) cover_item = None - covers = book.get_metadata('http://www.idpf.org/2007/opf', 'cover') + covers = book.get_metadata("http://www.idpf.org/2007/opf", "cover") if covers: - cover_tuple = covers[0] # ~= (None, {'name':'cover', 'content':'item1'}) - cover_item_id = cover_tuple[1]['content'] + cover_tuple = covers[0] # ~= (None, {'name':'cover', 'content':'item1'}) + cover_item_id = cover_tuple[1]["content"] for item in book.items: if item.id == cover_item_id: cover_item = item @@ -58,7 +56,9 @@ def create_image_from_epub(epubfile, fpath_out, crop=None): # 2. 
fallback to get first image in the ePub file images = list(book.get_items_of_type(ebooklib.ITEM_IMAGE)) if not images: - raise ThumbnailGenerationError("ePub file {} contains no images.".format(epubfile)) + raise ThumbnailGenerationError( + "ePub file {} contains no images.".format(epubfile) + ) # TODO: get largest image of the bunch image_data = BytesIO(images[0].get_content()) @@ -78,9 +78,9 @@ def create_image_from_zip(htmlfile, fpath_out, crop="smart"): biggest_name = None size = 0 try: - with zipfile.ZipFile(htmlfile, 'r') as zf: + with zipfile.ZipFile(htmlfile, "r") as zf: # get the biggest (most pixels) image in the zip - image_exts = ['png', 'PNG', 'jpeg', 'JPEG', 'jpg', 'JPG'] + image_exts = ["png", "PNG", "jpeg", "JPEG", "jpg", "JPG"] for filename in zf.namelist(): _, dotext = os.path.splitext(filename) ext = dotext[1:] @@ -94,7 +94,9 @@ def create_image_from_zip(htmlfile, fpath_out, crop="smart"): biggest_name = filename size = img_size if biggest_name is None: - raise ThumbnailGenerationError("HTML5 zip file {} contains no images.".format(htmlfile)) + raise ThumbnailGenerationError( + "HTML5 zip file {} contains no images.".format(htmlfile) + ) with zf.open(biggest_name) as fhandle: image_data = fhandle.read() with BytesIO(image_data) as bhandle: @@ -110,12 +112,14 @@ def create_image_from_pdf_page(fpath_in, fpath_out, page_number=0, crop=None): Create an image from the pdf at fpath_in and write result to fpath_out. """ try: - assert fpath_in.endswith('pdf'), "File must be in pdf format" - pages = convert_from_path(fpath_in, 500, first_page=page_number, last_page=page_number+1) + assert fpath_in.endswith("pdf"), "File must be in pdf format" + pages = convert_from_path( + fpath_in, 500, first_page=page_number, last_page=page_number + 1 + ) page = pages[0] # resize page = scale_and_crop_thumbnail(page, zoom=10, crop=crop) - page.save(fpath_out, 'PNG') + page.save(fpath_out, "PNG") except Exception as e: raise ThumbnailGenerationError("Fail on PDF {} {}".format(fpath_in, e)) @@ -123,33 +127,38 @@ def create_image_from_pdf_page(fpath_in, fpath_out, page_number=0, crop=None): # TILED THUMBNAILS FOR TOPIC NODES (FOLDERS) ################################################################################ + def create_tiled_image(source_images, fpath_out): """ Create a 16:9 tiled image from list of image paths provided in source_images and write result to fpath_out. 
""" try: - sizes = {1:1, 4:2, 9:3, 16:4, 25:5, 36:6, 49:7} - assert len(source_images) in sizes.keys(), "Number of images must be a perfect square <= 49" + sizes = {1: 1, 4: 2, 9: 3, 16: 4, 25: 5, 36: 6, 49: 7} + assert ( + len(source_images) in sizes.keys() + ), "Number of images must be a perfect square <= 49" root = sizes[len(source_images)] images = list(map(Image.open, source_images)) - new_im = Image.new('RGBA', THUMBNAIL_SIZE) - offset = (int(float(THUMBNAIL_SIZE[0]) / float(root)), - int(float(THUMBNAIL_SIZE[1]) / float(root)) ) + new_im = Image.new("RGBA", THUMBNAIL_SIZE) + offset = ( + int(float(THUMBNAIL_SIZE[0]) / float(root)), + int(float(THUMBNAIL_SIZE[1]) / float(root)), + ) index = 0 for y_index in range(root): for x_index in range(root): im = scale_and_crop_thumbnail(images[index], size=offset) - new_im.paste(im, (int(offset[0]*x_index), int(offset[1]*y_index))) + new_im.paste(im, (int(offset[0] * x_index), int(offset[1] * y_index))) index = index + 1 new_im.save(fpath_out) except Exception as e: raise ThumbnailGenerationError("Failed due to {}".format(e)) -def convert_image(filename, dest_dir=None, size=None, format='PNG'): +def convert_image(filename, dest_dir=None, size=None, format="PNG"): """ Converts an image to a specified output format. The converted image will have the same file basename as filename, but with the extension of the converted format. @@ -162,7 +171,9 @@ def convert_image(filename, dest_dir=None, size=None, format='PNG'): :returns: Path to converted file. """ - assert os.path.exists(filename), "Image file not found: {}".format(os.path.abspath(filename)) + assert os.path.exists(filename), "Image file not found: {}".format( + os.path.abspath(filename) + ) if not dest_dir: dest_dir = os.path.dirname(os.path.abspath(filename)) @@ -187,8 +198,10 @@ def convert_image(filename, dest_dir=None, size=None, format='PNG'): # EXCEPTIONS ################################################################################ + class ThumbnailGenerationError(Exception): """ Custom error returned when thumbnail extraction process fails. """ + pass diff --git a/ricecooker/utils/jsontrees.py b/ricecooker/utils/jsontrees.py index a864609e..f8abc66a 100644 --- a/ricecooker/utils/jsontrees.py +++ b/ricecooker/utils/jsontrees.py @@ -1,17 +1,20 @@ import json import os -from ricecooker.classes import files, nodes, questions +from le_utils.constants import content_kinds +from le_utils.constants import roles + +from ricecooker.classes import files +from ricecooker.classes import nodes +from ricecooker.classes import questions from ricecooker.classes.licenses import get_license -from ricecooker.config import LOGGER -from ricecooker.exceptions import UnknownFileTypeError, UnknownQuestionTypeError from ricecooker.classes.nodes import ChannelNode +from ricecooker.config import LOGGER +from ricecooker.exceptions import UnknownFileTypeError +from ricecooker.exceptions import UnknownQuestionTypeError # CONSTANTS USED TO SELECT APPROPRIATE CLASS DURING DESERIALIZATION FROM JSON ################################################################################ -from le_utils.constants import roles - -from le_utils.constants import content_kinds TOPIC_NODE = content_kinds.TOPIC VIDEO_NODE = content_kinds.VIDEO @@ -46,6 +49,7 @@ # JSON READ/WRITE HELPERS ################################################################################ + def read_tree_from_json(srcpath): """ Load ricecooker json tree data from json file at `srcpath`. 
@@ -53,7 +57,7 @@ def read_tree_from_json(srcpath): with open(srcpath) as infile: json_tree = json.load(infile) if json_tree is None: - raise ValueError('Could not find ricecooker json tree') + raise ValueError("Could not find ricecooker json tree") return json_tree @@ -64,25 +68,26 @@ def write_tree_to_json_tree(destpath, json_tree): parent_dir, _ = os.path.split(destpath) if not os.path.exists(parent_dir): os.makedirs(parent_dir, exist_ok=True) - with open(destpath, 'w', encoding='utf8') as json_file: + with open(destpath, "w", encoding="utf8") as json_file: json.dump(json_tree, json_file, indent=2, ensure_ascii=False) # CONSTRUCT CHANNEL FROM RICECOOKER JSON TREE ################################################################################ + def get_channel_node_from_json(json_tree): """ Build `ChannelNode` from json data provided in `json_tree`. """ channel = ChannelNode( - title=json_tree['title'], - description=json_tree['description'], - source_domain=json_tree['source_domain'], - source_id=json_tree['source_id'], - language=json_tree['language'], - tagline=json_tree.get('tagline', None), - thumbnail=json_tree.get('thumbnail', None), + title=json_tree["title"], + description=json_tree["description"], + source_domain=json_tree["source_domain"], + source_id=json_tree["source_id"], + language=json_tree["language"], + tagline=json_tree.get("tagline", None), + thumbnail=json_tree.get("thumbnail", None), ) return channel @@ -92,262 +97,264 @@ def build_tree_from_json(parent_node, sourcetree): Recusively parse nodes in the list `sourcetree` and add them as children to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`. """ - EXPECTED_NODE_TYPES = [TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, - DOCUMENT_NODE, HTML5_NODE, SLIDESHOW_NODE] + EXPECTED_NODE_TYPES = [ + TOPIC_NODE, + VIDEO_NODE, + AUDIO_NODE, + EXERCISE_NODE, + DOCUMENT_NODE, + HTML5_NODE, + SLIDESHOW_NODE, + ] for source_node in sourcetree: - kind = source_node['kind'] + kind = source_node["kind"] if kind not in EXPECTED_NODE_TYPES: - LOGGER.critical('Unexpected node kind found: ' + kind) - raise NotImplementedError('Unexpected node kind found in json data.') + LOGGER.critical("Unexpected node kind found: " + kind) + raise NotImplementedError("Unexpected node kind found in json data.") if kind == TOPIC_NODE: child_node = nodes.TopicNode( - source_id=source_node.get('source_id', None), - title=source_node['title'], - description=source_node.get('description'), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), + source_id=source_node.get("source_id", None), + title=source_node["title"], + description=source_node.get("description"), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), # no role for topics (computed dynaically from descendants) - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), - tags=source_node.get('tags'), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), ) parent_node.add_child(child_node) - source_tree_children = source_node.get('children', []) + source_tree_children = source_node.get("children", []) build_tree_from_json(child_node, source_tree_children) elif kind == VIDEO_NODE: child_node = nodes.VideoNode( - 
source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), - tags=source_node.get('tags'), + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), ) - add_files(child_node, source_node.get('files') or []) + add_files(child_node, source_node.get("files") or []) parent_node.add_child(child_node) elif kind == AUDIO_NODE: child_node = nodes.AudioNode( - source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), - tags=source_node.get('tags'), + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), ) - add_files(child_node, source_node.get('files') or []) + add_files(child_node, source_node.get("files") or []) parent_node.add_child(child_node) elif kind == EXERCISE_NODE: child_node = nodes.ExerciseNode( - source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), # not supported yet - tags=source_node.get('tags'), - exercise_data=source_node.get('exercise_data'), + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get( + "derive_thumbnail", False + ), # not supported yet + tags=source_node.get("tags"), + 
exercise_data=source_node.get("exercise_data"), questions=[], ) - add_questions(child_node, source_node.get('questions') or []) + add_questions(child_node, source_node.get("questions") or []) parent_node.add_child(child_node) elif kind == DOCUMENT_NODE: child_node = nodes.DocumentNode( - source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - tags=source_node.get('tags'), + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + tags=source_node.get("tags"), ) - add_files(child_node, source_node.get('files') or []) + add_files(child_node, source_node.get("files") or []) parent_node.add_child(child_node) elif kind == HTML5_NODE: child_node = nodes.HTML5AppNode( - source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), - tags=source_node.get('tags'), + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), ) - add_files(child_node, source_node.get('files') or []) + add_files(child_node, source_node.get("files") or []) parent_node.add_child(child_node) elif kind == SLIDESHOW_NODE: child_node = nodes.SlideshowNode( - source_id=source_node['source_id'], - title=source_node['title'], - description=source_node.get('description'), - license=get_license(**source_node['license']), - author=source_node.get('author'), - aggregator=source_node.get('aggregator'), - provider=source_node.get('provider'), - role=source_node.get('role', roles.LEARNER), - language=source_node.get('language'), - thumbnail=source_node.get('thumbnail'), - derive_thumbnail=source_node.get('derive_thumbnail', False), - tags=source_node.get('tags'), - + source_id=source_node["source_id"], + title=source_node["title"], + description=source_node.get("description"), + license=get_license(**source_node["license"]), + author=source_node.get("author"), + aggregator=source_node.get("aggregator"), + provider=source_node.get("provider"), + role=source_node.get("role", roles.LEARNER), + language=source_node.get("language"), + thumbnail=source_node.get("thumbnail"), + 
derive_thumbnail=source_node.get("derive_thumbnail", False), + tags=source_node.get("tags"), ) - add_files(child_node, source_node.get('files') or []) + add_files(child_node, source_node.get("files") or []) parent_node.add_child(child_node) # TODO: add support for H5P content kind else: - LOGGER.critical('Encountered an unknown kind: ' + str(source_node)) + LOGGER.critical("Encountered an unknown kind: " + str(source_node)) continue return parent_node def add_files(node, file_list): - EXPECTED_FILE_TYPES = [VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, EPUB_FILE, - HTML5_FILE, THUMBNAIL_FILE, SUBTITLES_FILE, SLIDESHOW_IMAGE_FILE] + EXPECTED_FILE_TYPES = [ + VIDEO_FILE, + AUDIO_FILE, + DOCUMENT_FILE, + EPUB_FILE, + HTML5_FILE, + THUMBNAIL_FILE, + SUBTITLES_FILE, + SLIDESHOW_IMAGE_FILE, + ] for f in file_list: - file_type = f.get('file_type') + file_type = f.get("file_type") if file_type not in EXPECTED_FILE_TYPES: LOGGER.critical(file_type) - raise NotImplementedError('Unexpected File type found in channel json.') + raise NotImplementedError("Unexpected File type found in channel json.") - path = f.get('path') # path can be an URL or a local path (or None) - preset = f.get('preset', None) + path = f.get("path") # path can be an URL or a local path (or None) + preset = f.get("preset", None) # handle different types of files if file_type == VIDEO_FILE: # handle three types of video files - if 'youtube_id' in f: + if "youtube_id" in f: video_file = files.YouTubeVideoFile( - youtube_id=f['youtube_id'], - download_settings=f.get('download_settings', None), - high_resolution=f.get('high_resolution', False), - maxheight=f.get('maxheight', None), - language=f.get('language', None), - preset=preset + youtube_id=f["youtube_id"], + download_settings=f.get("download_settings", None), + high_resolution=f.get("high_resolution", False), + maxheight=f.get("maxheight", None), + language=f.get("language", None), + preset=preset, ) - elif 'web_url' in f: + elif "web_url" in f: video_file = files.WebVideoFile( - web_url=f['web_url'], - download_settings=f.get('download_settings', None), - high_resolution=f.get('high_resolution', False), - maxheight=f.get('maxheight', None), - language=f.get('language', None), - preset=preset + web_url=f["web_url"], + download_settings=f.get("download_settings", None), + high_resolution=f.get("high_resolution", False), + maxheight=f.get("maxheight", None), + language=f.get("language", None), + preset=preset, ) else: video_file = files.VideoFile( - path=f['path'], - language=f.get('language', None), - ffmpeg_settings=f.get('ffmpeg_settings'), - + path=f["path"], + language=f.get("language", None), + ffmpeg_settings=f.get("ffmpeg_settings"), ) node.add_file(video_file) elif file_type == AUDIO_FILE: node.add_file( files.AudioFile( - path=f['path'], - language=f.get('language', None), - preset=preset - + path=f["path"], language=f.get("language", None), preset=preset ) ) elif file_type == DOCUMENT_FILE: node.add_file( files.DocumentFile( - path=path, - language=f.get('language', None), - preset=preset - + path=path, language=f.get("language", None), preset=preset ) ) elif file_type == EPUB_FILE: node.add_file( files.EPubFile( - path=path, - language=f.get('language', None), - preset=preset - + path=path, language=f.get("language", None), preset=preset ) ) elif file_type == HTML5_FILE: node.add_file( files.HTMLZipFile( - path=path, - language=f.get('language', None), - preset=preset - + path=path, language=f.get("language", None), preset=preset ) ) elif file_type == THUMBNAIL_FILE: - if 
'encoding' in f: + if "encoding" in f: node.add_file( files.Base64ImageFile( - encoding=f['encoding'], + encoding=f["encoding"], ) ) else: node.add_file( files.ThumbnailFile( path=path, - language=f.get('language', None), + language=f.get("language", None), ) ) elif file_type == SUBTITLES_FILE: - if 'youtube_id' in f: + if "youtube_id" in f: node.add_file( files.YouTubeSubtitleFile( - youtube_id=f['youtube_id'], - language=f['language'] + youtube_id=f["youtube_id"], language=f["language"] ) ) else: - keys = ['language', 'subtitlesformat'] - params = {'path': path} + keys = ["language", "subtitlesformat"] + params = {"path": path} for key in keys: if key in f: params[key] = f[key] @@ -357,68 +364,75 @@ def add_files(node, file_list): node.add_file( files.SlideImageFile( path=path, - language=f.get('language', None), - caption=f.get('caption', ''), - descriptive_text=f.get('descriptive_text', '') + language=f.get("language", None), + caption=f.get("caption", ""), + descriptive_text=f.get("descriptive_text", ""), ) ) else: - raise UnknownFileTypeError('Unrecognized file type "{0}"'.format(f['path'])) + raise UnknownFileTypeError('Unrecognized file type "{0}"'.format(f["path"])) def add_questions(exercise_node, question_list): - EXPECTED_QUESTION_TYPES = [INPUT_QUESTION, MULTIPLE_SELECTION, SINGLE_SELECTION, - FREE_RESPONSE, PERSEUS_QUESTION] + EXPECTED_QUESTION_TYPES = [ + INPUT_QUESTION, + MULTIPLE_SELECTION, + SINGLE_SELECTION, + FREE_RESPONSE, + PERSEUS_QUESTION, + ] for q in question_list: - question_type = q.get('question_type') + question_type = q.get("question_type") if question_type not in EXPECTED_QUESTION_TYPES: LOGGER.critical(question_type) - raise NotImplementedError('Unexpected question type found in channel json.') + raise NotImplementedError("Unexpected question type found in channel json.") - question_text = q.get('question') - hints = q.get('hints') + question_text = q.get("question") + hints = q.get("hints") hints = hints if isinstance(hints, str) else [hint for hint in hints or []] if question_type == exercises.MULTIPLE_SELECTION: q_obj = questions.MultipleSelectQuestion( - id=q['id'], + id=q["id"], question=question_text, - correct_answers=[answer for answer in q['correct_answers']], - all_answers=[answer for answer in q['all_answers']], + correct_answers=[answer for answer in q["correct_answers"]], + all_answers=[answer for answer in q["all_answers"]], hints=hints, ) exercise_node.add_question(q_obj) elif question_type == exercises.SINGLE_SELECTION: q_obj = questions.SingleSelectQuestion( - id=q['id'], + id=q["id"], question=question_text, - correct_answer=q['correct_answer'], - all_answers=[answer for answer in q['all_answers']], + correct_answer=q["correct_answer"], + all_answers=[answer for answer in q["all_answers"]], hints=hints, ) exercise_node.add_question(q_obj) elif question_type == exercises.INPUT_QUESTION: q_obj = questions.InputQuestion( - id=q['id'], + id=q["id"], question=question_text, - answers=[answer for answer in q['answers']], + answers=[answer for answer in q["answers"]], hints=hints, ) exercise_node.add_question(q_obj) elif question_type == exercises.PERSEUS_QUESTION: q_obj = questions.PerseusQuestion( - id=q['id'], - raw_data=q.get('item_data'), - source_url=q.get('source_url') or 'https://www.khanacademy.org/', + id=q["id"], + raw_data=q.get("item_data"), + source_url=q.get("source_url") or "https://www.khanacademy.org/", ) exercise_node.add_question(q_obj) else: raise UnknownQuestionTypeError( - 'Unrecognized question type {0}: accepted types are 
{1}'.format(question_type, [key for key, value in - exercises.question_choices])) + "Unrecognized question type {0}: accepted types are {1}".format( + question_type, [key for key, value in exercises.question_choices] + ) + ) diff --git a/ricecooker/utils/kolibripreview.py b/ricecooker/utils/kolibripreview.py index c17b55fe..5e312432 100755 --- a/ricecooker/utils/kolibripreview.py +++ b/ricecooker/utils/kolibripreview.py @@ -9,9 +9,9 @@ def validate(srcdir): """ Check if `srcdir` has an index.html in it. """ - indexpath = os.path.join(srcdir, 'index.html') + indexpath = os.path.join(srcdir, "index.html") if not os.path.exists(indexpath): - print('Missing index.html file in', srcdir) + print("Missing index.html file in", srcdir) return False return True @@ -21,20 +21,25 @@ def main(args): Command line utility for previewing HTML5App content in Kolbri. """ if not os.path.exists(args.srcdir) or not os.path.isdir(args.srcdir): - print('Error:', args.srcdir, 'is not a directory.') + print("Error:", args.srcdir, "is not a directory.") sys.exit(1) if not validate(args.srcdir): - print('Validation failed; exiting.') + print("Validation failed; exiting.") sys.exit(2) # Write the contents of `srcdir` to `destzip` destzipbase, _ = os.path.splitext(args.destzip) - shutil.make_archive(destzipbase, 'zip', args.srcdir) + shutil.make_archive(destzipbase, "zip", args.srcdir) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser(description=main.__doc__) - parser.add_argument('--srcdir', help='HTML5 webroot (source directory)', default='.') - parser.add_argument('--destzip', help='Path to a HTML5 zip file in local Kolibri installation', required=True) + parser.add_argument( + "--srcdir", help="HTML5 webroot (source directory)", default="." + ) + parser.add_argument( + "--destzip", + help="Path to a HTML5 zip file in local Kolibri installation", + required=True, + ) args = parser.parse_args() main(args) - diff --git a/ricecooker/utils/libstudio.py b/ricecooker/utils/libstudio.py index 587ba9ff..fd5c7787 100644 --- a/ricecooker/utils/libstudio.py +++ b/ricecooker/utils/libstudio.py @@ -1,10 +1,11 @@ import requests + from ricecooker.config import LOGGER # DEFAULT_STUDIO_URL = 'https://develop.studio.learningequality.org' # DEFAULT_STUDIO_URL = 'http://127.0.0.1:8080' -DEFAULT_STUDIO_URL = 'https://studio.learningequality.org' +DEFAULT_STUDIO_URL = "https://studio.learningequality.org" # TODO https://studio.learningequality.org/api/get_node_path/ca8f380/18932/41b2549 @@ -18,8 +19,10 @@ class StudioApi(object): corrections, and other automation. 
""" - def __init__(self, token, username=None, password=None, studio_url=DEFAULT_STUDIO_URL): - self.studio_url = studio_url.rstrip('/') + def __init__( + self, token, username=None, password=None, studio_url=DEFAULT_STUDIO_URL + ): + self.studio_url = studio_url.rstrip("/") self.token = token self.licenses_by_id = self.get_licenses() if username and password: @@ -28,10 +31,10 @@ def __init__(self, token, username=None, password=None, studio_url=DEFAULT_STUDI self.session = None def _create_logged_in_session(self, username, password): - LOGIN_ENDPOINT = self.studio_url + '/accounts/login/' + LOGIN_ENDPOINT = self.studio_url + "/accounts/login/" session = requests.session() session.headers.update({"referer": self.studio_url}) - session.headers.update({'User-Agent': 'Mozilla/5.0 Firefox/63.0'}) + session.headers.update({"User-Agent": "Mozilla/5.0 Firefox/63.0"}) session.get(LOGIN_ENDPOINT) csrftoken = session.cookies.get("csrftoken") session.headers.update({"csrftoken": csrftoken}) @@ -39,13 +42,12 @@ def _create_logged_in_session(self, username, password): post_data = { "csrfmiddlewaretoken": csrftoken, "username": username, - "password": password + "password": password, } response2 = session.post(LOGIN_ENDPOINT, data=post_data) - assert response2.status_code == 200, 'Login POST failed' + assert response2.status_code == 200, "Login POST failed" return session - def get_channel(self, channel_id): """ Calls the /api/channel/{{channel_id}} endpoint to get the channel info. @@ -58,44 +60,42 @@ def get_channel(self, channel_id): created this channel. If `Null` this means it's a manually uploaded channel or a derivative channel """ - CHANNEL_ENDPOINT = self.studio_url + '/api/channel/' + CHANNEL_ENDPOINT = self.studio_url + "/api/channel/" # TODO: add TokenAuth to this entpoint so can use without session login # headers = {"Authorization": "Token {0}".format(self.token)} url = CHANNEL_ENDPOINT + channel_id - LOGGER.info(' GET ' + url) + LOGGER.info(" GET " + url) response = self.session.get(url) channel_data = response.json() return channel_data - def get_channel_root_studio_id(self, channel_id, tree='main'): + def get_channel_root_studio_id(self, channel_id, tree="main"): """ Return the `studio_id` for the root of the tree `tree` for `channel_id`. """ channel_data = self.get_channel(channel_id) - tree_key = tree + '_tree' + tree_key = tree + "_tree" tree_data = channel_data[tree_key] - return tree_data['id'] - + return tree_data["id"] def get_licenses(self): - LICENSES_LIST_ENDPOINT = self.studio_url + '/api/license' + LICENSES_LIST_ENDPOINT = self.studio_url + "/api/license" headers = {"Authorization": "Token {0}".format(self.token)} response = requests.get(LICENSES_LIST_ENDPOINT, headers=headers) licenses_list = response.json() licenses_dict = {} for license in licenses_list: - licenses_dict[license['id']] = license + licenses_dict[license["id"]] = license return licenses_dict - def get_nodes_by_ids_complete(self, studio_id): """ Get the complete JSON representation of a content node from the Studio API. 
""" - NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/' + NODES_ENDPOINT = self.studio_url + "/api/get_nodes_by_ids_complete/" headers = {"Authorization": "Token {0}".format(self.token)} url = NODES_ENDPOINT + studio_id - LOGGER.info(' GET ' + url) + LOGGER.info(" GET " + url) response = requests.get(url, headers=headers) studio_node = response.json()[0] return studio_node @@ -106,20 +106,23 @@ def get_nodes_by_ids_bulk(self, studio_ids): content node data in chunks of 10 from the Studio API. """ CHUNK_SIZE = 25 - NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/' + NODES_ENDPOINT = self.studio_url + "/api/get_nodes_by_ids_complete/" headers = {"Authorization": "Token {0}".format(self.token)} studio_nodes = [] - studio_ids_chunks = [studio_ids[i:i+CHUNK_SIZE] for i in range(0, len(studio_ids), CHUNK_SIZE)] + studio_ids_chunks = [ + studio_ids[i : i + CHUNK_SIZE] + for i in range(0, len(studio_ids), CHUNK_SIZE) + ] for studio_ids_chunk in studio_ids_chunks: - studio_ids_csv = ','.join(studio_ids_chunk) + studio_ids_csv = ",".join(studio_ids_chunk) url = NODES_ENDPOINT + studio_ids_csv - LOGGER.info(' GET ' + url) + LOGGER.info(" GET " + url) response = requests.get(url, headers=headers) chunk_nodes = response.json() for chunk_node in chunk_nodes: - if 'children' in chunk_node: - child_nodes = self.get_nodes_by_ids_bulk(chunk_node['children']) - chunk_node['children'] = child_nodes + if "children" in chunk_node: + child_nodes = self.get_nodes_by_ids_bulk(chunk_node["children"]) + chunk_node["children"] = child_nodes studio_nodes.extend(chunk_nodes) return studio_nodes @@ -128,13 +131,12 @@ def get_tree_for_studio_id(self, studio_id): Returns the full json tree (recusive calls to /api/get_nodes_by_ids_complete) """ channel_root = self.get_nodes_by_ids_complete(studio_id) - if 'children' in channel_root: - children_refs = channel_root['children'] + if "children" in channel_root: + children_refs = channel_root["children"] studio_nodes = self.get_nodes_by_ids_bulk(children_refs) - channel_root['children'] = studio_nodes + channel_root["children"] = studio_nodes return channel_root - def get_contentnode(self, studio_id): """ Return the `studio_id` for the root of the tree `tree` for `channel_id`. @@ -145,9 +147,11 @@ def put_contentnode(self, data): """ Send a PUT requests to /api/contentnode to update Studio node to data. """ - CONTENTNODE_ENDPOINT = self.studio_url + '/api/contentnode' - REQUIRED_FIELDS = ['id', 'tags', 'prerequisite', 'parent'] - assert data_has_required_keys(data, REQUIRED_FIELDS), 'missing necessary attributes' + CONTENTNODE_ENDPOINT = self.studio_url + "/api/contentnode" + REQUIRED_FIELDS = ["id", "tags", "prerequisite", "parent"] + assert data_has_required_keys( + data, REQUIRED_FIELDS + ), "missing necessary attributes" # studio_id = data['id'] url = CONTENTNODE_ENDPOINT # print(' semantic PATCH using PUT ' + url) @@ -164,16 +168,18 @@ def delete_contentnode(self, data, channel_id, trash_studio_id=None): can provide `trash_studio_id` which is the studio id the trash tree for the channel. 
""" - MOVE_NODES_ENDPOINT = self.studio_url + '/api/move_nodes/' - REQUIRED_FIELDS = ['id'] - assert data_has_required_keys(data, REQUIRED_FIELDS), 'missing necessary attributes' + MOVE_NODES_ENDPOINT = self.studio_url + "/api/move_nodes/" + REQUIRED_FIELDS = ["id"] + assert data_has_required_keys( + data, REQUIRED_FIELDS + ), "missing necessary attributes" if trash_studio_id is None: channel_data = self.get_channel(channel_id) - trash_studio_id = channel_data['trash_tree']['id'] + trash_studio_id = channel_data["trash_tree"]["id"] post_data = { - 'nodes': [data], - 'target_parent': trash_studio_id, - 'channel_id': channel_id, + "nodes": [data], + "target_parent": trash_studio_id, + "channel_id": channel_id, } url = MOVE_NODES_ENDPOINT # print(' semantic DELETE using POST to ' + url) @@ -188,13 +194,13 @@ def copy_contentnode(self, data, target_parent, channel_id): Send a POST requests to /api/duplicate_node_inline/ to copy node `data` to the target parent folder `target_parent` in channel `channel_id`. """ - DUPLICATE_NODE_INLINE_ENDPOINT = self.studio_url + '/api/duplicate_nodes/' - REQUIRED_FIELDS = ['id'] - assert data_has_required_keys(data, REQUIRED_FIELDS), 'no studio_id in data' + DUPLICATE_NODE_INLINE_ENDPOINT = self.studio_url + "/api/duplicate_nodes/" + REQUIRED_FIELDS = ["id"] + assert data_has_required_keys(data, REQUIRED_FIELDS), "no studio_id in data" post_data = { - 'node_ids': [data['id']], - 'target_parent': target_parent, - 'channel_id': channel_id, + "node_ids": [data["id"]], + "target_parent": target_parent, + "channel_id": channel_id, } url = DUPLICATE_NODE_INLINE_ENDPOINT # print(' semantic COPY using POST to ' + url) @@ -205,17 +211,9 @@ def copy_contentnode(self, data, target_parent, channel_id): return copied_data_list - def data_has_required_keys(data, required_keys): verdict = True for key in required_keys: if key not in data: verdict = False return verdict - - - - - - - diff --git a/ricecooker/utils/linecook.py b/ricecooker/utils/linecook.py index c82b35bc..221c9b66 100644 --- a/ricecooker/utils/linecook.py +++ b/ricecooker/utils/linecook.py @@ -1,28 +1,46 @@ import argparse import os -from ricecooker.config import LOGGER from le_utils.constants import content_kinds -from .metadata_provider import path_to_tuple -from .jsontrees import (TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE, - DOCUMENT_NODE, HTML5_NODE) -from .jsontrees import (VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, EPUB_FILE, HTML5_FILE, - THUMBNAIL_FILE, SUBTITLES_FILE) + +from .jsontrees import AUDIO_FILE +from .jsontrees import AUDIO_NODE +from .jsontrees import DOCUMENT_FILE +from .jsontrees import DOCUMENT_NODE +from .jsontrees import EPUB_FILE +from .jsontrees import EXERCISE_NODE +from .jsontrees import HTML5_FILE +from .jsontrees import HTML5_NODE +from .jsontrees import SUBTITLES_FILE +from .jsontrees import THUMBNAIL_FILE +from .jsontrees import TOPIC_NODE +from .jsontrees import VIDEO_FILE +from .jsontrees import VIDEO_NODE from .jsontrees import write_tree_to_json_tree +from .metadata_provider import path_to_tuple +from ricecooker.config import LOGGER # LINECOOK CONFIGS ################################################################################ DIR_EXCLUDE_PATTERNS = [] -FILE_EXCLUDE_EXTENTIONS = ['.DS_Store', 'Thumbs.db', 'ehthumbs.db', 'ehthumbs_vista.db', '.gitkeep'] +FILE_EXCLUDE_EXTENTIONS = [ + ".DS_Store", + "Thumbs.db", + "ehthumbs.db", + "ehthumbs_vista.db", + ".gitkeep", +] FILE_SKIP_PATTENRS = [] -FILE_SKIP_THUMBNAILS = [] # global list of paths that correspond to 
thumbnails for other content nodes - +FILE_SKIP_THUMBNAILS = ( + [] +) # global list of paths that correspond to thumbnails for other content nodes # LINECOOK HELPER FUNCTIONS ################################################################################ + def chan_path_from_rel_path(rel_path, channeldir): """ Convert `rel_path` from os.walk tuple format to a tuple of directories and @@ -36,13 +54,14 @@ def chan_path_from_rel_path(rel_path, channeldir): dirs_before_channeldir = channeldir.split(os.path.sep)[:-1] channel_chan_path = [] # path relative to channel root, inclusive for idx, part in enumerate(rel_path_parts): - if idx < len(dirs_before_channeldir) and dirs_before_channeldir[idx]==part: + if idx < len(dirs_before_channeldir) and dirs_before_channeldir[idx] == part: continue else: channel_chan_path.append(part) chan_path = os.path.join(*channel_chan_path) return chan_path + def rel_path_from_chan_path(chan_path, channeldir, windows=False): """ Convert `chan_path` as obtained from a metadata provider into a `rel_path` @@ -51,35 +70,41 @@ def rel_path_from_chan_path(chan_path, channeldir, windows=False): 'content/open_stax_zip/Open Stax/Math' """ if windows: - chan_path_list = chan_path.split('\\') + chan_path_list = chan_path.split("\\") else: - chan_path_list = chan_path.split('/') + chan_path_list = chan_path.split("/") chan_path_list.pop(0) # remove the channel root dir rel_path = os.path.join(channeldir, *chan_path_list) return rel_path + def get_topic_for_path(channel, chan_path_tuple): """ Given a channel (dict) that contains a hierarchy of TopicNode dicts, we walk the path given in `chan_path_tuple` to find the corresponding TopicNode. """ - assert chan_path_tuple[0] == channel['dirname'], 'Wrong channeldir' + assert chan_path_tuple[0] == channel["dirname"], "Wrong channeldir" chan_path_list = list(chan_path_tuple) - chan_path_list.pop(0) # skip the channel name + chan_path_list.pop(0)  # skip the channel name if len(chan_path_list) == 0: return channel current = channel for subtopic in chan_path_list: - current = list(filter(lambda d: 'dirname' in d and d['dirname'] == subtopic, current['children']))[0] + current = list( + filter( + lambda d: "dirname" in d and d["dirname"] == subtopic, + current["children"], + ) + )[0] return current - # LINECOOK BUILD JSON TREE ################################################################################ + def filter_filenames(filenames): """ Skip files with extensions in `FILE_EXCLUDE_EXTENTIONS` and filenames that @@ -91,19 +116,27 @@ def filter_filenames(filenames): for pattern in FILE_EXCLUDE_EXTENTIONS: if filename.endswith(pattern): keep = False - for pattern in FILE_SKIP_PATTENRS: # This will reject exercises... + for pattern in FILE_SKIP_PATTENRS:  # This will reject exercises... if pattern in filename: keep = False if keep: filenames_cleaned.append(filename) return filenames_cleaned + def filter_thumbnail_files(chan_path, filenames, metadata_provider): """ We don't want to create `ContentNode` from thumbnail files.
""" - thumbnail_files_to_skip = set(os.path.join(*p) for p in metadata_provider.get_thumbnail_paths()) - return [filename for filename in filenames if os.path.join(chan_path, filename) not in thumbnail_files_to_skip] + thumbnail_files_to_skip = set( + os.path.join(*p) for p in metadata_provider.get_thumbnail_paths() + ) + return [ + filename + for filename in filenames + if os.path.join(chan_path, filename) not in thumbnail_files_to_skip + ] + def keep_folder(raw_path): """ @@ -112,78 +145,93 @@ def keep_folder(raw_path): keep = True for pattern in DIR_EXCLUDE_PATTERNS: if pattern in raw_path: - LOGGER.debug('rejecting', raw_path) + LOGGER.debug("rejecting", raw_path) keep = False return keep + def process_folder(channel, rel_path, filenames, metadata_provider): """ Create `ContentNode`s from each file in this folder and the node to `channel` under the path `rel_path`. """ - LOGGER.debug('IN process_folder ' + str(rel_path) + ' ' + str(filenames)) + LOGGER.debug("IN process_folder " + str(rel_path) + " " + str(filenames)) if not keep_folder(rel_path): return chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir) chan_path_tuple = path_to_tuple(chan_path) chan_path_list = list(chan_path_tuple) - LOGGER.debug('chan_path_list=' + str(chan_path_list)) + LOGGER.debug("chan_path_list=" + str(chan_path_list)) # FIND THE CONTAINING NODE (channel or topic) if len(chan_path_list) == 1: # CASE CHANNEL ROOT: `rel_path` points to `channeldir` # No need to create a topic node here since channel already exists - containing_node = channel # attach content nodes in filenames directly to channel + containing_node = ( + channel # attach content nodes in filenames directly to channel + ) else: # CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a TopicNode) - dirname = chan_path_list.pop() # name of the folder (used as ID for internal lookup) + dirname = ( + chan_path_list.pop() + ) # name of the folder (used as ID for internal lookup) topic_parent_node = get_topic_for_path(channel, chan_path_list) # read topic metadata to get title and description for the TopicNode topic_metadata = metadata_provider.get(chan_path_tuple) - thumbnail_chan_path = topic_metadata.get('thumbnail_chan_path', None) + thumbnail_chan_path = topic_metadata.get("thumbnail_chan_path", None) if thumbnail_chan_path: - thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir) + thumbnail_rel_path = rel_path_from_chan_path( + thumbnail_chan_path, metadata_provider.channeldir + ) else: thumbnail_rel_path = None # create TopicNode for this folder topic = dict( kind=TOPIC_NODE, dirname=dirname, - source_id='sourceid:' + rel_path, - title=topic_metadata.get('title', dirname), - description=topic_metadata.get('description', None), - author=topic_metadata.get('author', None), - language=topic_metadata.get('language', None), - license=topic_metadata.get('license', None), + source_id="sourceid:" + rel_path, + title=topic_metadata.get("title", dirname), + description=topic_metadata.get("description", None), + author=topic_metadata.get("author", None), + language=topic_metadata.get("language", None), + license=topic_metadata.get("license", None), thumbnail=thumbnail_rel_path, children=[], ) - topic_parent_node['children'].append(topic) - containing_node = topic # attach content nodes in filenames to the newly created topic + topic_parent_node["children"].append(topic) + containing_node = ( + topic # attach content nodes in filenames to the newly created topic + ) # filter 
filenames filenames_cleaned = filter_filenames(filenames) - filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, metadata_provider) + filenames_cleaned2 = filter_thumbnail_files( + chan_path, filenames_cleaned, metadata_provider + ) # PROCESS FILES for filename in filenames_cleaned2: chan_filepath = os.path.join(chan_path, filename) chan_filepath_tuple = path_to_tuple(chan_filepath) metadata = metadata_provider.get(chan_filepath_tuple) - node = make_content_node(metadata_provider.channeldir, rel_path, filename, metadata) - containing_node['children'].append(node) # attach content node to containing_node + node = make_content_node( + metadata_provider.channeldir, rel_path, filename, metadata + ) + containing_node["children"].append( + node + ) # attach content node to containing_node def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path): """ Download all categories, subpages, modules, and resources from open.edu. """ - LOGGER.info('Starting to build the ricecooker_json_tree') + LOGGER.info("Starting to build the ricecooker_json_tree") - channeldir = args['channeldir'] + channeldir = args["channeldir"] if channeldir.endswith(os.path.sep): channeldir.rstrip(os.path.sep) channelparentdir, channeldirname = os.path.split(channeldir) @@ -191,23 +239,25 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path) # Ricecooker tree channel_info = metadata_provider.get_channel_info() - thumbnail_chan_path = channel_info.get('thumbnail_chan_path', None) + thumbnail_chan_path = channel_info.get("thumbnail_chan_path", None) if thumbnail_chan_path: - thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir) + thumbnail_rel_path = rel_path_from_chan_path( + thumbnail_chan_path, metadata_provider.channeldir + ) else: thumbnail_rel_path = None ricecooker_json_tree = dict( dirname=channeldirname, - title=channel_info['title'], - description=channel_info['description'], - source_domain=channel_info['source_domain'], - source_id=channel_info['source_id'], - language=channel_info['language'], + title=channel_info["title"], + description=channel_info["description"], + source_domain=channel_info["source_domain"], + source_id=channel_info["source_id"], + language=channel_info["language"], thumbnail=thumbnail_rel_path, children=[], ) - channeldir = args['channeldir'] + channeldir = args["channeldir"] content_folders = sorted(os.walk(channeldir)) # MAIN PROCESSING OF os.walk OUTPUT @@ -215,7 +265,7 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path) # TODO(ivan): figure out all the implications of the # _ = content_folders.pop(0) # Skip over channel folder because handled above for rel_path, _subfolders, filenames in content_folders: - LOGGER.info('processing folder ' + str(rel_path)) + LOGGER.info("processing folder " + str(rel_path)) # IMPLEMENTATION DETAIL: # - `filenames` contains real files in the `channeldir` folder @@ -223,17 +273,23 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path) # order of nodes within a given topic. 
Since alphabetical order is used to # walk the files in the `channeldir`, we must "splice in" the exercises here if metadata_provider.has_exercises(): - dir_chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir) + dir_chan_path = chan_path_from_rel_path( + rel_path, metadata_provider.channeldir + ) dir_path_tuple = path_to_tuple(dir_chan_path) - exercises_filenames = metadata_provider.get_exercises_for_dir(dir_path_tuple) + exercises_filenames = metadata_provider.get_exercises_for_dir( + dir_path_tuple + ) filenames.extend(exercises_filenames) sorted_filenames = sorted(filenames) - process_folder(ricecooker_json_tree, rel_path, sorted_filenames, metadata_provider) + process_folder( + ricecooker_json_tree, rel_path, sorted_filenames, metadata_provider + ) # Write out ricecooker_json_tree.json write_tree_to_json_tree(json_tree_path, ricecooker_json_tree) - LOGGER.info('Folder hierarchy walk result stored in ' + json_tree_path) + LOGGER.info("Folder hierarchy walk result stored in " + json_tree_path) def make_content_node(channeldir, rel_path, filename, metadata): @@ -244,24 +300,28 @@ def make_content_node(channeldir, rel_path, filename, metadata): ext = file_ext[1:] kind = None if ext in content_kinds.MAPPING: - kind = content_kinds.MAPPING[ext] # guess what kind based on file extension - elif 'questions' in metadata: + kind = content_kinds.MAPPING[ext] # guess what kind based on file extension + elif "questions" in metadata: kind = content_kinds.EXERCISE else: - raise ValueError('Could not find kind for extension ' + str(ext) + ' in content_kinds.MAPPING') + raise ValueError( + "Could not find kind for extension " + + str(ext) + + " in content_kinds.MAPPING" + ) # Extract metadata fields - source_id = metadata.get('source_id', None) + source_id = metadata.get("source_id", None) if source_id is None: - source_id = metadata['chan_path'] + source_id = metadata["chan_path"] filepath = os.path.join(rel_path, filename) - title = metadata['title'] - description = metadata.get('description', None) - author = metadata.get('author', None) - lang = metadata.get('language', None) - license_dict = metadata.get('license', None) - thumbnail_chan_path = metadata.get('thumbnail_chan_path', None) + title = metadata["title"] + description = metadata.get("description", None) + author = metadata.get("author", None) + lang = metadata.get("language", None) + license_dict = metadata.get("license", None) + thumbnail_chan_path = metadata.get("thumbnail_chan_path", None) if thumbnail_chan_path: thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, channeldir) else: @@ -278,7 +338,9 @@ def make_content_node(channeldir, rel_path, filename, metadata): license=license_dict, derive_thumbnail=True, thumbnail=thumbnail_rel_path, - files=[{'file_type':VIDEO_FILE, 'path':filepath, 'language':lang}], # ffmpeg_settings={"crf": 24}, + files=[ + {"file_type": VIDEO_FILE, "path": filepath, "language": lang} + ], # ffmpeg_settings={"crf": 24}, ) elif kind == AUDIO_NODE: @@ -292,7 +354,7 @@ def make_content_node(channeldir, rel_path, filename, metadata): license=license_dict, thumbnail=thumbnail_rel_path, derive_thumbnail=True, - files=[{'file_type':AUDIO_FILE, 'path':filepath, 'language':lang}], + files=[{"file_type": AUDIO_FILE, "path": filepath, "language": lang}], ) elif kind == DOCUMENT_NODE: @@ -306,24 +368,16 @@ def make_content_node(channeldir, rel_path, filename, metadata): license=license_dict, thumbnail=thumbnail_rel_path, derive_thumbnail=True, - files=[] + files=[], ) - if ext == 'pdf': 
- pdf_file = { - 'file_type':DOCUMENT_FILE, - 'path':filepath, - 'language':lang - } - content_node['files'].append(pdf_file) - elif ext == 'epub': - epub_file = { - 'file_type':EPUB_FILE, - 'path':filepath, - 'language':lang - } - content_node['files'].append(epub_file) + if ext == "pdf": + pdf_file = {"file_type": DOCUMENT_FILE, "path": filepath, "language": lang} + content_node["files"].append(pdf_file) + elif ext == "epub": + epub_file = {"file_type": EPUB_FILE, "path": filepath, "language": lang} + content_node["files"].append(epub_file) else: - raise ValueError('Ext {} not supported for kind {}'.format(ext, kind)) + raise ValueError("Ext {} not supported for kind {}".format(ext, kind)) elif kind == HTML5_NODE: content_node = dict( @@ -336,7 +390,7 @@ def make_content_node(channeldir, rel_path, filename, metadata): license=license_dict, thumbnail=thumbnail_rel_path, derive_thumbnail=True, - files=[{'file_type':HTML5_FILE, 'path':filepath, 'language':lang}], + files=[{"file_type": HTML5_FILE, "path": filepath, "language": lang}], ) elif kind == EXERCISE_NODE: @@ -348,34 +402,36 @@ def make_content_node(channeldir, rel_path, filename, metadata): description=description, language=lang, license=license_dict, - exercise_data=metadata['exercise_data'], - questions=metadata['questions'], + exercise_data=metadata["exercise_data"], + questions=metadata["questions"], thumbnail=thumbnail_rel_path, derive_thumbnail=False, files=[], ) else: - raise ValueError('Not implemented case for kind ' + str(kind)) + raise ValueError("Not implemented case for kind " + str(kind)) return content_node - # AUTOMATIC REMOVAL OF TRAILING SLASHES FOR chenneldir ################################################################################ + class NonFolderError(Exception): pass + class FolderExistsAction(argparse.Action): """ Custom argparse action: verify the argument to be a folder (directory). The action will strip off trailing slashes from the folder's name. 
""" + def verify_folder_existence(self, folder_name): if not os.path.isdir(folder_name): - message = 'ERROR: {0} is not a folder'.format(folder_name) + message = "ERROR: {0} is not a folder".format(folder_name) raise NonFolderError(message) folder_name = folder_name.rstrip(os.sep) return folder_name diff --git a/ricecooker/utils/metadata_provider.py b/ricecooker/utils/metadata_provider.py index 179c6c61..ba34de2e 100644 --- a/ricecooker/utils/metadata_provider.py +++ b/ricecooker/utils/metadata_provider.py @@ -1,51 +1,52 @@ -from collections import defaultdict import csv import json import os import re -import requests +from collections import defaultdict from unicodedata import normalize -from le_utils.constants import content_kinds, exercises -from ricecooker.config import LOGGER -from ricecooker.utils.libstudio import StudioApi +import requests +from le_utils.constants import content_kinds +from le_utils.constants import exercises from ricecooker.classes.questions import MARKDOWN_IMAGE_REGEX +from ricecooker.config import LOGGER +from ricecooker.utils.libstudio import StudioApi # CONSTANTS ################################################################################ -DEFAULT_EXTRA_ITEMS_SEPARATOR = '🍣' # used to separate list-like data in CSV -CSV_STR_TRUE_VALUES = ['on', 'yes', '1', 'true'] -CSV_STR_FALSE_VALUES = ['off', 'no', '0', 'false'] - -DEFAULT_CHANNEL_INFO_FILENAME = 'Channel.csv' -CHANNEL_TITLE_KEY = 'Title' -CHANNEL_DESCRIPTION_KEY = 'Description' -CHANNEL_DOMAIN_KEY = 'Domain' -CHANNEL_SOURCEID_KEY = 'Source ID' -CHANNEL_LANGUAGE_KEY = 'Language' -CHANNEL_THUMBNAIL_KEY = 'Thumbnail' +DEFAULT_EXTRA_ITEMS_SEPARATOR = "🍣" # used to separate list-like data in CSV +CSV_STR_TRUE_VALUES = ["on", "yes", "1", "true"] +CSV_STR_FALSE_VALUES = ["off", "no", "0", "false"] + +DEFAULT_CHANNEL_INFO_FILENAME = "Channel.csv" +CHANNEL_TITLE_KEY = "Title" +CHANNEL_DESCRIPTION_KEY = "Description" +CHANNEL_DOMAIN_KEY = "Domain" +CHANNEL_SOURCEID_KEY = "Source ID" +CHANNEL_LANGUAGE_KEY = "Language" +CHANNEL_THUMBNAIL_KEY = "Thumbnail" CHANNEL_INFO_HEADER = [ CHANNEL_TITLE_KEY, CHANNEL_DESCRIPTION_KEY, CHANNEL_DOMAIN_KEY, CHANNEL_SOURCEID_KEY, CHANNEL_LANGUAGE_KEY, - CHANNEL_THUMBNAIL_KEY + CHANNEL_THUMBNAIL_KEY, ] -DEFAULT_CONTENT_INFO_FILENAME = 'Content.csv' -CONTENT_PATH_KEY = 'Path *' -CONTENT_TITLE_KEY = 'Title *' -CONTENT_SOURCEID_KEY = 'Source ID' -CONTENT_DESCRIPTION_KEY = 'Description' -CONTENT_AUTHOR_KEY = 'Author' -CONTENT_LANGUAGE_KEY = 'Language' -CONTENT_LICENSE_ID_KEY = 'License ID *' -CONTENT_LICENSE_DESCRIPTION_KEY = 'License Description' -CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY = 'Copyright Holder' -CONTENT_THUMBNAIL_KEY = 'Thumbnail' +DEFAULT_CONTENT_INFO_FILENAME = "Content.csv" +CONTENT_PATH_KEY = "Path *" +CONTENT_TITLE_KEY = "Title *" +CONTENT_SOURCEID_KEY = "Source ID" +CONTENT_DESCRIPTION_KEY = "Description" +CONTENT_AUTHOR_KEY = "Author" +CONTENT_LANGUAGE_KEY = "Language" +CONTENT_LICENSE_ID_KEY = "License ID *" +CONTENT_LICENSE_DESCRIPTION_KEY = "License Description" +CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY = "Copyright Holder" +CONTENT_THUMBNAIL_KEY = "Thumbnail" CONTENT_INFO_HEADER = [ CONTENT_PATH_KEY, CONTENT_TITLE_KEY, @@ -56,14 +57,14 @@ CONTENT_LICENSE_ID_KEY, CONTENT_LICENSE_DESCRIPTION_KEY, CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, - CONTENT_THUMBNAIL_KEY + CONTENT_THUMBNAIL_KEY, ] -DEFAULT_EXERCISES_INFO_FILENAME = 'Exercises.csv' -EXERCISE_SOURCEID_KEY = 'Source ID *' -EXERCISE_M_KEY = 'Number Correct' # (integer) -EXERCISE_N_KEY = 'Out of Total' # (integer) 
-EXERCISE_RANDOMIZE_KEY = 'Randomize' # Use 'true' (default) or 'false' +DEFAULT_EXERCISES_INFO_FILENAME = "Exercises.csv" +EXERCISE_SOURCEID_KEY = "Source ID *" +EXERCISE_M_KEY = "Number Correct" # (integer) +EXERCISE_N_KEY = "Out of Total" # (integer) +EXERCISE_RANDOMIZE_KEY = "Randomize" # Use 'true' (default) or 'false' EXERCISE_INFO_HEADER = [ CONTENT_PATH_KEY, CONTENT_TITLE_KEY, @@ -77,30 +78,38 @@ EXERCISE_M_KEY, EXERCISE_N_KEY, EXERCISE_RANDOMIZE_KEY, - CONTENT_THUMBNAIL_KEY + CONTENT_THUMBNAIL_KEY, ] -DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME = 'ExerciseQuestions.csv' -EXERCISE_QUESTIONS_QUESTIONID_KEY = 'Question ID *' # unique idendifier for this question -EXERCISE_QUESTIONS_TYPE_KEY = 'Question type *' # one of ['SingleSelectQuestion', 'MultipleSelectQuestion', 'InputQuestion'] -EXERCISE_QUESTIONS_QUESTION_KEY = 'Question *' # string that contains the question setup and the prompt -EXERCISE_QUESTIONS_OPTION_A_KEY = 'Option A' -EXERCISE_QUESTIONS_OPTION_B_KEY = 'Option B' -EXERCISE_QUESTIONS_OPTION_C_KEY = 'Option C' -EXERCISE_QUESTIONS_OPTION_D_KEY = 'Option D' -EXERCISE_QUESTIONS_OPTION_E_KEY = 'Option E' -EXERCISE_QUESTIONS_OPTION_FGHI_KEY = 'Options F...' # This field can contain a list of multiple '🍣'-separated string values, - # e.g., 'Anser F🍣Answer G🍣Answer H' (or other suitable unicode character) -EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY = 'Correct Answer *' # A string that equals one of the options strings -EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY = 'Correct Answer 2' # (for multiple select) -EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY = 'Correct Answer 3' # (for multiple select) -EXERCISE_QUESTIONS_HINT_1_KEY = 'Hint 1' -EXERCISE_QUESTIONS_HINT_2_KEY = 'Hint 2' -EXERCISE_QUESTIONS_HINT_3_KEY = 'Hint 3' -EXERCISE_QUESTIONS_HINT_4_KEY = 'Hint 4' -EXERCISE_QUESTIONS_HINT_5_KEY = 'Hint 5' -EXERCISE_QUESTIONS_HINT_6789_KEY = 'Hint 6+' # This field can contain a list of multiple '🍣'-separated string values, - # e.g., 'Hint 6 text🍣Hint 7 text🍣Hing 8 text' +DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME = "ExerciseQuestions.csv" +EXERCISE_QUESTIONS_QUESTIONID_KEY = ( + "Question ID *" # unique idendifier for this question +) +EXERCISE_QUESTIONS_TYPE_KEY = "Question type *" # one of ['SingleSelectQuestion', 'MultipleSelectQuestion', 'InputQuestion'] +EXERCISE_QUESTIONS_QUESTION_KEY = ( + "Question *" # string that contains the question setup and the prompt +) +EXERCISE_QUESTIONS_OPTION_A_KEY = "Option A" +EXERCISE_QUESTIONS_OPTION_B_KEY = "Option B" +EXERCISE_QUESTIONS_OPTION_C_KEY = "Option C" +EXERCISE_QUESTIONS_OPTION_D_KEY = "Option D" +EXERCISE_QUESTIONS_OPTION_E_KEY = "Option E" +EXERCISE_QUESTIONS_OPTION_FGHI_KEY = "Options F..." 
# This field can contain a list of multiple '🍣'-separated string values, +# e.g., 'Answer F🍣Answer G🍣Answer H' (or other suitable unicode character) +EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY = ( + "Correct Answer *" # A string that equals one of the options strings +) +EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY = "Correct Answer 2" # (for multiple select) +EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY = "Correct Answer 3" # (for multiple select) +EXERCISE_QUESTIONS_HINT_1_KEY = "Hint 1" +EXERCISE_QUESTIONS_HINT_2_KEY = "Hint 2" +EXERCISE_QUESTIONS_HINT_3_KEY = "Hint 3" +EXERCISE_QUESTIONS_HINT_4_KEY = "Hint 4" +EXERCISE_QUESTIONS_HINT_5_KEY = "Hint 5" +EXERCISE_QUESTIONS_HINT_6789_KEY = ( + "Hint 6+" # This field can contain a list of multiple '🍣'-separated string values, +) +# e.g., 'Hint 6 text🍣Hint 7 text🍣Hint 8 text' EXERCISE_QUESTIONS_INFO_HEADER = [ EXERCISE_SOURCEID_KEY, EXERCISE_QUESTIONS_QUESTIONID_KEY, @@ -120,13 +129,14 @@ EXERCISE_QUESTIONS_HINT_3_KEY, EXERCISE_QUESTIONS_HINT_4_KEY, EXERCISE_QUESTIONS_HINT_5_KEY, - EXERCISE_QUESTIONS_HINT_6789_KEY + EXERCISE_QUESTIONS_HINT_6789_KEY, ] # HELPER FUNCTIONS ################################################################################ + def path_to_tuple(path): """ Split a current file system path into individual parts and form a tuple for key lookups. @@ -146,7 +156,7 @@ def path_to_tuple(path): # Normalize UTF-8 encoding to consistent form so cache lookups will work, see # https://docs.python.org/3.6/library/unicodedata.html#unicodedata.normalize - path_tup = tuple(normalize('NFD', part) for part in allparts) + path_tup = tuple(normalize("NFD", part) for part in allparts) return path_tup @@ -155,15 +165,16 @@ def input_path_to_tuple(path, windows=False): Split `chan_path` into individual parts and form a tuple (used as key). """ if windows: - path_tup = tuple(path.split('\\')) + path_tup = tuple(path.split("\\")) else: - path_tup = tuple(path.split('/')) + path_tup = tuple(path.split("/")) # # Normalize UTF-8 encoding to consistent form so cache lookups will work, see # https://docs.python.org/3.6/library/unicodedata.html#unicodedata.normalize - path_tup = tuple(normalize('NFD', part) for part in path_tup) + path_tup = tuple(normalize("NFD", part) for part in path_tup) return path_tup + def get_metadata_file_path(channeldir, filename): """ Return the path to the metadata file named `filename` that is a sibling of `channeldir`. @@ -172,10 +183,10 @@ def get_metadata_file_path(channeldir, filename): return os.path.join(channelparentdir, filename) - # METADATA PROVIDER BASE CLASS ################################################################################ + class MetadataProvider(object): def validate(self): """Check if metadata provided is valid.""" @@ -183,13 +194,16 @@ def validate(self): class CsvMetadataProvider(MetadataProvider): - - def __init__(self, channeldir, - channelinfo=DEFAULT_CHANNEL_INFO_FILENAME, - contentinfo=DEFAULT_CONTENT_INFO_FILENAME, - exercisesinfo=DEFAULT_EXERCISES_INFO_FILENAME, - questionsinfo=DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME, - winpaths=False, validate_and_cache=True): + def __init__( + self, + channeldir, + channelinfo=DEFAULT_CHANNEL_INFO_FILENAME, + contentinfo=DEFAULT_CONTENT_INFO_FILENAME, + exercisesinfo=DEFAULT_EXERCISES_INFO_FILENAME, + questionsinfo=DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME, + winpaths=False, + validate_and_cache=True, + ): """ Load the metadata from CSV files `channelinfo`, `contentinfo`, and optionally exercise data from `exercisesinfo` and `questionsinfo` files.
@@ -203,14 +217,16 @@ def __init__(self, channeldir, self.contentinfo = contentinfo self.exercisesinfo = exercisesinfo self.questionsinfo = questionsinfo - self.contentcache = {} # { ('chan', 'path','as','tuple's) --> node metadata dict - self.exercise_filenames_in_dir = defaultdict(list) # { ('chan', 'path','some','dir) --> list of exercises (virtual filenames) + self.contentcache = ( + {} + ) # { ('chan', 'path','as','tuple's) --> node metadata dict + self.exercise_filenames_in_dir = defaultdict( + list + ) # { ('chan', 'path','some','dir) --> list of exercises (virtual filenames) self.winpaths = winpaths # paths separator in .csv is windows '\' if validate_and_cache: self.validate_headers() - self.cache_contentinfo() # read and parse CSV to build cache lookup table - - + self.cache_contentinfo() # read and parse CSV to build cache lookup table # MAIN METHODS ############################################################################ @@ -227,7 +243,9 @@ def cache_contentinfo(self): dict_reader = csv.DictReader(csv_lines) for row in dict_reader: row_dict = self._map_content_row_to_dict(row) - path_tuple = input_path_to_tuple(row_dict['chan_path'], windows=self.winpaths) + path_tuple = input_path_to_tuple( + row_dict["chan_path"], windows=self.winpaths + ) self.contentcache[path_tuple] = row_dict # Additional handling of data in Exercises.csv and ExerciseQuestions.txt @@ -239,8 +257,8 @@ def cache_contentinfo(self): dict_reader = csv.DictReader(csv_lines) for question_row in dict_reader: question_dict = self._map_exercise_question_row_to_dict(question_row) - question_source_id = question_dict['source_id'] - del question_dict['source_id'] + question_source_id = question_dict["source_id"] + del question_dict["source_id"] questions_by_source_id[question_source_id].append(question_dict) # B. 
Load exercises @@ -249,9 +267,11 @@ def cache_contentinfo(self): dict_reader = csv.DictReader(csv_lines) for exercise_row in dict_reader: exercise_dict = self._map_exercise_row_to_dict(exercise_row) - path_tuple = input_path_to_tuple(exercise_dict['chan_path'], windows=self.winpaths) - question_source_id = exercise_dict['source_id'] - exercise_dict['questions'] = questions_by_source_id[question_source_id] + path_tuple = input_path_to_tuple( + exercise_dict["chan_path"], windows=self.winpaths + ) + question_source_id = exercise_dict["source_id"] + exercise_dict["questions"] = questions_by_source_id[question_source_id] # B1: exercises are standard content nodes, so add to contentcache self.contentcache[path_tuple] = exercise_dict # B2: add exercise to list of virtual filanames for current folder @@ -268,10 +288,10 @@ def get(self, path_tuple): else: # TODO: make chef robust to missing metadata # LOGGER.error( - LOGGER.warning('No metadata found for path_tuple ' + str(path_tuple)) + LOGGER.warning("No metadata found for path_tuple " + str(path_tuple)) metadata = dict( filepath=os.path.sep.join(path_tuple), - title=os.path.sep.join(path_tuple) + title=os.path.sep.join(path_tuple), ) return metadata @@ -279,13 +299,15 @@ def get_channel_info(self): """ Returns the first data row from Channel.csv """ - csv_filename = get_metadata_file_path(channeldir=self.channeldir, filename=self.channelinfo) + csv_filename = get_metadata_file_path( + channeldir=self.channeldir, filename=self.channelinfo + ) csv_lines = _read_csv_lines(csv_filename) dict_reader = csv.DictReader(csv_lines) - channel_csvs_list = list(dict_reader) + channel_csvs_list = list(dict_reader) channel_csv = channel_csvs_list[0] if len(channel_csvs_list) > 1: - raise ValueError('Found multiple channel rows in ' + self.channelinfo) + raise ValueError("Found multiple channel rows in " + self.channelinfo) channel_cleaned = _clean_dict(channel_csv) channel_info = self._map_channel_row_to_dict(channel_cleaned) return channel_info @@ -297,20 +319,22 @@ def get_thumbnail_paths(self): thumbnail_path_tuples = [] # channel thumbnail channel_info = self.get_channel_info() - chthumbnail_path = channel_info.get('thumbnail_chan_path', None) + chthumbnail_path = channel_info.get("thumbnail_chan_path", None) if chthumbnail_path: - chthumbnail_path_tuple = input_path_to_tuple(chthumbnail_path, windows=self.winpaths) + chthumbnail_path_tuple = input_path_to_tuple( + chthumbnail_path, windows=self.winpaths + ) thumbnail_path_tuples.append(chthumbnail_path_tuple) # content thumbnails for content_file_path_tuple, row in self.contentcache.items(): - thumbnail_path = row.get('thumbnail_chan_path', None) + thumbnail_path = row.get("thumbnail_chan_path", None) if thumbnail_path: - thumbnail_path_tuple = input_path_to_tuple(thumbnail_path, windows=self.winpaths) + thumbnail_path_tuple = input_path_to_tuple( + thumbnail_path, windows=self.winpaths + ) thumbnail_path_tuples.append(thumbnail_path_tuple) return thumbnail_path_tuples - - # CHANNEL+CONTENT PARSING METHODS ############################################################################ @@ -326,7 +350,7 @@ def _map_channel_row_to_dict(self, row): source_domain=channel_cleaned[CHANNEL_DOMAIN_KEY], source_id=channel_cleaned[CHANNEL_SOURCEID_KEY], language=channel_cleaned[CHANNEL_LANGUAGE_KEY], - thumbnail_chan_path=channel_cleaned[CHANNEL_THUMBNAIL_KEY] + thumbnail_chan_path=channel_cleaned[CHANNEL_THUMBNAIL_KEY], ) return channel_dict @@ -341,7 +365,9 @@ def _map_content_row_to_dict(self, row): license_dict = 
dict( license_id=row_cleaned[CONTENT_LICENSE_ID_KEY], description=row_cleaned.get(CONTENT_LICENSE_DESCRIPTION_KEY, None), - copyright_holder=row_cleaned.get(CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None) + copyright_holder=row_cleaned.get( + CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None + ), ) else: license_dict = None @@ -354,12 +380,10 @@ def _map_content_row_to_dict(self, row): author=row_cleaned.get(CONTENT_AUTHOR_KEY, None), language=row_cleaned.get(CONTENT_LANGUAGE_KEY, None), license=license_dict, - thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None) + thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None), ) return row_dict - - # EXERCISES CSV PARSING METHODS ############################################################################ @@ -378,7 +402,6 @@ def get_exercises_for_dir(self, dir_path_tuple): """ return self.exercise_filenames_in_dir[dir_path_tuple] - def _map_exercise_row_to_dict(self, row): """ Convert dictionary keys from raw CSV Exercise format to ricecooker keys. @@ -389,7 +412,9 @@ def _map_exercise_row_to_dict(self, row): license_dict = dict( license_id=row_cleaned[CONTENT_LICENSE_ID_KEY], description=row_cleaned.get(CONTENT_LICENSE_DESCRIPTION_KEY, None), - copyright_holder=row_cleaned.get(CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None) + copyright_holder=row_cleaned.get( + CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None + ), ) else: license_dict = None @@ -401,17 +426,19 @@ def _map_exercise_row_to_dict(self, row): elif randomize_raw.lower() in CSV_STR_FALSE_VALUES: randomize = False else: - raise ValueError('Unrecognized value ' + randomize_raw + ' for randomzied key') + raise ValueError( + "Unrecognized value " + randomize_raw + " for randomzied key" + ) exercise_data = dict( mastery_model=exercises.M_OF_N, randomize=randomize, ) m_value = row_cleaned.get(EXERCISE_M_KEY, None) if m_value: - exercise_data['m'] = int(m_value) + exercise_data["m"] = int(m_value) n_value = row_cleaned.get(EXERCISE_N_KEY, None) if n_value: - exercise_data['n'] = int(n_value) + exercise_data["n"] = int(n_value) exercise_dict = dict( chan_path=row_cleaned[CONTENT_PATH_KEY], @@ -422,7 +449,7 @@ def _map_exercise_row_to_dict(self, row): language=row_cleaned.get(CONTENT_LANGUAGE_KEY, None), license=license_dict, exercise_data=exercise_data, - thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None) + thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None), ) return exercise_dict @@ -518,14 +545,12 @@ def _map_exercise_question_row_to_dict(self, row): hints=hints, ) elif question_type == exercises.PERSEUS_QUESTION: - raise ValueError('Perseus questions not currently supported in CSV workflow.') + raise ValueError( + "Perseus questions not currently supported in CSV workflow." 
+ ) return question_dict - - - - # CSV VALIDATION METHODS ############################################################################ @@ -537,8 +562,12 @@ def validate_headers(self): self.validate_header(self.channeldir, self.channelinfo, CHANNEL_INFO_HEADER) self.validate_header(self.channeldir, self.contentinfo, CONTENT_INFO_HEADER) if self.has_exercises(): - self.validate_header(self.channeldir, self.exercisesinfo, EXERCISE_INFO_HEADER) - self.validate_header(self.channeldir, self.questionsinfo, EXERCISE_QUESTIONS_INFO_HEADER) + self.validate_header( + self.channeldir, self.exercisesinfo, EXERCISE_INFO_HEADER + ) + self.validate_header( + self.channeldir, self.questionsinfo, EXERCISE_QUESTIONS_INFO_HEADER + ) def validate_header(self, channeldir, filename, expected_header): """ @@ -550,8 +579,12 @@ def validate_header(self, channeldir, filename, expected_header): dict_reader = csv.DictReader(csv_lines) actual = set(dict_reader.fieldnames) if not actual == expected: - raise ValueError('Unexpected CSV file header in ' + csv_filename \ - + ' Expected header:' + str(expected)) + raise ValueError( + "Unexpected CSV file header in " + + csv_filename + + " Expected header:" + + str(expected) + ) def validate(self): """ @@ -559,7 +592,6 @@ def validate(self): """ pass # TODO - # Generate CSV metadata from a given studio_id ############################################################################ @@ -568,117 +600,124 @@ def generate_exercises_from_importstudioid(self, args, options): Create rows in Exercises.csv and ExerciseQuestions.csv from a Studio channel, specified based on a studio_id (e.g. studio_id of main_tree for some channel) """ - print('Generating Exercises.csv and ExerciseQuestions.csv from a Studio channel') - self.studioapi = StudioApi(token=args['token']) - channel_dict = self.studioapi.get_tree_for_studio_id(args['importstudioid']) - json.dump(channel_dict, open('chefdata/studiotree.json', 'w'), indent=4, ensure_ascii=False, sort_keys=True) + print( + "Generating Exercises.csv and ExerciseQuestions.csv from a Studio channel" + ) + self.studioapi = StudioApi(token=args["token"]) + channel_dict = self.studioapi.get_tree_for_studio_id(args["importstudioid"]) + json.dump( + channel_dict, + open("chefdata/studiotree.json", "w"), + indent=4, + ensure_ascii=False, + sort_keys=True, + ) soure_ids_seen = [] + def _generate_source_id(subtree): """ Creates a Source ID from title and ensures it is unique within channel.
""" - candidate = subtree['title'].replace(' ', '_') + candidate = subtree["title"].replace(" ", "_") if candidate not in soure_ids_seen: source_id = candidate soure_ids_seen.append(source_id) else: - source_id = candidate + subtree['node_id'][0:7] + source_id = candidate + subtree["node_id"][0:7] soure_ids_seen.append(source_id) return source_id def _write_subtree(path_tuple, subtree, is_root=False): - print(' '*len(path_tuple) + ' - ', subtree['title']) - kind = subtree['kind'] + print(" " * len(path_tuple) + " - ", subtree["title"]) + kind = subtree["kind"] # TOPIC ############################################################ - if kind == 'topic': + if kind == "topic": if is_root: - self.write_topic_row_from_studio_dict(path_tuple, subtree, is_root=is_root) - for child in subtree['children']: + self.write_topic_row_from_studio_dict( + path_tuple, subtree, is_root=is_root + ) + for child in subtree["children"]: _write_subtree(path_tuple, child) else: self.write_topic_row_from_studio_dict(path_tuple, subtree) - for child in subtree['children']: - _write_subtree(path_tuple+[subtree['title']], child) + for child in subtree["children"]: + _write_subtree(path_tuple + [subtree["title"]], child) # EXERCISE ######################################################### - elif kind == 'exercise': + elif kind == "exercise": source_id = _generate_source_id(subtree) self.write_exercice_row_from_studio_dict(path_tuple, subtree, source_id) - for question_dict in subtree['assessment_items']: + for question_dict in subtree["assessment_items"]: self.write_question_row_from_question_dict(source_id, question_dict) else: - print('skipping node', subtree['title']) + print("skipping node", subtree["title"]) - path_tuple = [ self.channeldir.split('/')[-1] ] + path_tuple = [self.channeldir.split("/")[-1]] _write_subtree(path_tuple, channel_dict, is_root=True) def write_commont_studio_dict_from_row(self, studio_dict, row): - if studio_dict['license']: - license_dict = self.studioapi.licenses_by_id[studio_dict['license']] + if studio_dict["license"]: + license_dict = self.studioapi.licenses_by_id[studio_dict["license"]] else: - license_dict = {'license_name': None} - row[CONTENT_TITLE_KEY] = studio_dict['title'] - row[CONTENT_DESCRIPTION_KEY] = studio_dict['description'] - row[CONTENT_AUTHOR_KEY] = studio_dict['author'] - row[CONTENT_LANGUAGE_KEY] = 'en' - row[CONTENT_LICENSE_ID_KEY] = license_dict['license_name'] + license_dict = {"license_name": None} + row[CONTENT_TITLE_KEY] = studio_dict["title"] + row[CONTENT_DESCRIPTION_KEY] = studio_dict["description"] + row[CONTENT_AUTHOR_KEY] = studio_dict["author"] + row[CONTENT_LANGUAGE_KEY] = "en" + row[CONTENT_LICENSE_ID_KEY] = license_dict["license_name"] row[CONTENT_LICENSE_DESCRIPTION_KEY] = None - row[CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY] = studio_dict['copyright_holder'] + row[CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY] = studio_dict["copyright_holder"] row[CONTENT_THUMBNAIL_KEY] = None - def write_topic_row_from_studio_dict(self, path_tuple, studio_dict, is_root=False): if is_root: return # print('Generating Content.csv rows folders and file in channeldir for path_tuple ', path_tuple, studio_dict['title']) file_path = get_metadata_file_path(self.channeldir, self.contentinfo) - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, CONTENT_INFO_HEADER) - title = studio_dict['title'] - path_with_self = '/'.join(path_tuple+[title]) + title = studio_dict["title"] + path_with_self = "/".join(path_tuple + [title]) if 
not os.path.exists(path_with_self): os.makedirs(path_with_self, exist_ok=True) topic_row = {} self.write_commont_studio_dict_from_row(studio_dict, topic_row) # WRITE TOPIC ROW topic_row[CONTENT_PATH_KEY] = path_with_self - topic_row[CONTENT_SOURCEID_KEY] = studio_dict['node_id'][0:7] + topic_row[CONTENT_SOURCEID_KEY] = studio_dict["node_id"][0:7] csvwriter.writerow(topic_row) - def write_exercice_row_from_studio_dict(self, path_tuple, studio_dict, source_id): file_path = get_metadata_file_path(self.channeldir, self.exercisesinfo) - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, EXERCISE_INFO_HEADER) exercise_row = {} self.write_commont_studio_dict_from_row(studio_dict, exercise_row) - exercise_title = studio_dict['title'] - exercise_row[CONTENT_PATH_KEY] = '/'.join(path_tuple+[exercise_title]) + exercise_title = studio_dict["title"] + exercise_row[CONTENT_PATH_KEY] = "/".join(path_tuple + [exercise_title]) exercise_row[EXERCISE_SOURCEID_KEY] = source_id # Exercises specifics - if isinstance(studio_dict['extra_fields'], str): - extra_fields = json.loads(studio_dict['extra_fields']) + if isinstance(studio_dict["extra_fields"], str): + extra_fields = json.loads(studio_dict["extra_fields"]) else: - extra_fields = studio_dict['extra_fields'] - exercise_row[EXERCISE_M_KEY] = int(extra_fields['m']) - exercise_row[EXERCISE_N_KEY] = int(extra_fields['n']) - exercise_row[EXERCISE_RANDOMIZE_KEY] = extra_fields['randomize'] + extra_fields = studio_dict["extra_fields"] + exercise_row[EXERCISE_M_KEY] = int(extra_fields["m"]) + exercise_row[EXERCISE_N_KEY] = int(extra_fields["n"]) + exercise_row[EXERCISE_RANDOMIZE_KEY] = extra_fields["randomize"] # WRITE EXERCISE ROW csvwriter.writerow(exercise_row) - - def _make_local_question_images(self, question_dict): """ Process all mardown image links in question_dict: - download them to local files under exerciseimages/ """ question_dict = question_dict.copy() - dest_path = 'exerciseimages/' + dest_path = "exerciseimages/" if not os.path.exists(dest_path): os.mkdir(dest_path) @@ -686,53 +725,58 @@ def _make_local_question_images(self, question_dict): # helper method def _process_string(string): image_regex = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE) - contentstorage_prefix = '${☣ CONTENTSTORAGE}/' - studio_storage = 'https://studio.learningequality.org/content/storage/' + contentstorage_prefix = "${☣ CONTENTSTORAGE}/" + studio_storage = "https://studio.learningequality.org/content/storage/" matches = image_regex.findall(string) # Parse all matches for match in matches: file_result = match[1] - file_name = file_result.replace(contentstorage_prefix, '') - file_url = studio_storage + file_name[0] + '/' + file_name[1] + '/' + file_name + file_name = file_result.replace(contentstorage_prefix, "") + file_url = ( + studio_storage + file_name[0] + "/" + file_name[1] + "/" + file_name + ) file_local_path = os.path.join(dest_path, file_name) response = requests.get(file_url) if response.status_code != 200: - print('Failed for image ' + str(response.status_code) + ' >> ' + file_url) + print( + "Failed for image " + + str(response.status_code) + + " >> " + + file_url + ) return string - with open(file_local_path, 'wb') as local_file: + with open(file_local_path, "wb") as local_file: local_file.write(response.content) - print('saved image file', file_local_path) + print("saved image file", file_local_path) string = string.replace(file_result, file_local_path) return string # Process images in 
question - new_question = _process_string(question_dict['question']) - question_dict['question'] = new_question + new_question = _process_string(question_dict["question"]) + question_dict["question"] = new_question # Process images in answers - answers = json.loads(question_dict['answers']) + answers = json.loads(question_dict["answers"]) new_answers = [] for ans in answers: new_ans = ans.copy() - new_ans['answer'] = _process_string(new_ans['answer']) + new_ans["answer"] = _process_string(new_ans["answer"]) new_answers.append(new_ans) - question_dict['answers'] = json.dumps(new_answers) + question_dict["answers"] = json.dumps(new_answers) # TODO: process hint images return question_dict - - def write_question_row_from_question_dict(self, source_id, question_dict): file_path = get_metadata_file_path(self.channeldir, self.questionsinfo) - if question_dict['type'] == 'perseus_question': - print('Skipping perseus_question -- not supported in CSV workflow.') + if question_dict["type"] == "perseus_question": + print("Skipping perseus_question -- not supported in CSV workflow.") return - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER) def _safe_list_get(l, idx, default): @@ -745,48 +789,71 @@ def _safe_list_get(l, idx, default): question_dict = self._make_local_question_images(question_dict) type_lookup = { - 'single_selection': exercises.SINGLE_SELECTION, - 'true_false': exercises.SINGLE_SELECTION, - 'multiple_selection': exercises.MULTIPLE_SELECTION, - 'input_question': exercises.INPUT_QUESTION, + "single_selection": exercises.SINGLE_SELECTION, + "true_false": exercises.SINGLE_SELECTION, + "multiple_selection": exercises.MULTIPLE_SELECTION, + "input_question": exercises.INPUT_QUESTION, } # ANSWERS - answers = json.loads(question_dict['answers']) + answers = json.loads(question_dict["answers"]) options = [] # all options correct = [] # correct andwers for ans in answers: - options.append(ans['answer']) - if ans['correct']: - correct.append(ans['answer']) + options.append(ans["answer"]) + if ans["correct"]: + correct.append(ans["answer"]) extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:]) # HINTS - hints_raw = json.loads(question_dict['hints']) + hints_raw = json.loads(question_dict["hints"]) if hints_raw: - raise ValueError('Found hints but not handled..') + raise ValueError("Found hints but not handled..") - LOGGER.info(' - writing question with studio_id=' + question_dict['assessment_id']) + LOGGER.info( + " - writing question with studio_id=" + + question_dict["assessment_id"] + ) question_row = {} question_row[EXERCISE_SOURCEID_KEY] = source_id - question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id'] # question_dict['assessment_id'] - question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']] - question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question'] - question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None) - question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None) - question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None) - question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None) - question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None) + question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict[ + "assessment_id" + ] # question_dict['assessment_id'] + 
question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[ + question_dict["type"] + ] + question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict["question"] + question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get( + options, 0, None + ) + question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get( + options, 1, None + ) + question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get( + options, 2, None + ) + question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get( + options, 3, None + ) + question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get( + options, 4, None + ) question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options - question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None) - question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None) - question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None) - question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO - question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO - question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO - question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO - question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO - question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get( + correct, 0, None + ) + question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get( + correct, 1, None + ) + question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get( + correct, 2, None + ) + question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO + question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO # WRITE QUESTION ROW csvwriter.writerow(question_row) # 'files': [], @@ -796,9 +863,6 @@ def _safe_list_get(l, idx, default): # 'randomize': True, # 'deleted': False}, - - - # Generate CSV from folder structure in channeldir ############################################################################ @@ -806,34 +870,44 @@ def generate_contentinfo_from_channeldir(self, args, options): """ Create rows in Content.csv for each folder and file in `self.channeldir`. 
""" - LOGGER.info('Generating Content.csv rows folders and file in channeldir') + LOGGER.info("Generating Content.csv rows folders and file in channeldir") file_path = get_metadata_file_path(self.channeldir, self.contentinfo) - with open(file_path, 'a') as csv_file: + with open(file_path, "a") as csv_file: csvwriter = csv.DictWriter(csv_file, CONTENT_INFO_HEADER) - channeldir = args['channeldir'] + channeldir = args["channeldir"] if channeldir.endswith(os.path.sep): channeldir.rstrip(os.path.sep) # MAIN PROCESSING OF os.walk OUTPUT content_folders = sorted(os.walk(channeldir)) - _ = content_folders.pop(0) # Skip over channel root folder + _ = content_folders.pop(0) # Skip over channel root folder for rel_path, _subfolders, filenames in content_folders: - LOGGER.info('processing folder ' + str(rel_path)) + LOGGER.info("processing folder " + str(rel_path)) sorted_filenames = sorted(filenames) - self.generate_contentinfo_from_folder(csvwriter, rel_path, sorted_filenames) - LOGGER.info('Generted {} row for all folders and files in {}'.format(self.contentinfo, self.channeldir)) + self.generate_contentinfo_from_folder( + csvwriter, rel_path, sorted_filenames + ) + LOGGER.info( + "Generted {} row for all folders and files in {}".format( + self.contentinfo, self.channeldir + ) + ) def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames): """ Create a topic node row in Content.csv for the folder at `rel_path` and add content node rows for all the files in the `rel_path` folder. """ - LOGGER.debug('IN process_folder ' + str(rel_path) + ' ' + str(filenames)) - from ricecooker.utils.linecook import filter_filenames, filter_thumbnail_files, chan_path_from_rel_path + LOGGER.debug("IN process_folder " + str(rel_path) + " " + str(filenames)) + from ricecooker.utils.linecook import ( + filter_filenames, + filter_thumbnail_files, + chan_path_from_rel_path, + ) # WRITE TOPIC ROW - topicrow = self.channeldir_node_to_row( rel_path.split(os.path.sep) ) + topicrow = self.channeldir_node_to_row(rel_path.split(os.path.sep)) csvwriter.writerow(topicrow) # WRITE CONTENT NODE ROWS @@ -846,7 +920,6 @@ def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames): filerow = self.channeldir_node_to_row(path_tuple) csvwriter.writerow(filerow) - def channeldir_node_to_row(self, path_tuple): """ Return a dict with keys corresponding to Content.csv columns. @@ -854,18 +927,17 @@ def channeldir_node_to_row(self, path_tuple): row = dict() for key in CONTENT_INFO_HEADER: row[key] = None - row[CONTENT_PATH_KEY] = "/".join(path_tuple) # use / in .csv on Windows and UNIX - title = path_tuple[-1].replace('_', ' ') + row[CONTENT_PATH_KEY] = "/".join( + path_tuple + ) # use / in .csv on Windows and UNIX + title = path_tuple[-1].replace("_", " ") for ext in content_kinds.MAPPING.keys(): if title.endswith(ext): - title = title.replace('.'+ext, '') + title = title.replace("." + ext, "") row[CONTENT_TITLE_KEY] = title row[CONTENT_SOURCEID_KEY] = path_tuple[-1] return row - - - # UTILS ############################################################################ @@ -874,19 +946,27 @@ def generate_templates(self, exercise_questions=False): Create empty .csv files with the right headers and place them in the Will place files as siblings of directory `channeldir`. 
""" - self.generate_template(channeldir=self.channeldir, - filename=self.channelinfo, - header=CHANNEL_INFO_HEADER) - self.generate_template(channeldir=self.channeldir, - filename=self.contentinfo, - header=CONTENT_INFO_HEADER) + self.generate_template( + channeldir=self.channeldir, + filename=self.channelinfo, + header=CHANNEL_INFO_HEADER, + ) + self.generate_template( + channeldir=self.channeldir, + filename=self.contentinfo, + header=CONTENT_INFO_HEADER, + ) if exercise_questions: - self.generate_template(channeldir=self.channeldir, - filename=self.exercisesinfo, - header=EXERCISE_INFO_HEADER) - self.generate_template(channeldir=self.channeldir, - filename=self.questionsinfo, - header=EXERCISE_QUESTIONS_INFO_HEADER) + self.generate_template( + channeldir=self.channeldir, + filename=self.exercisesinfo, + header=EXERCISE_INFO_HEADER, + ) + self.generate_template( + channeldir=self.channeldir, + filename=self.questionsinfo, + header=EXERCISE_QUESTIONS_INFO_HEADER, + ) def generate_template(self, channeldir, filename, header): """ @@ -895,7 +975,7 @@ def generate_template(self, channeldir, filename, header): """ file_path = get_metadata_file_path(channeldir, filename) if not os.path.exists(file_path): - with open(file_path, 'w') as csv_file: + with open(file_path, "w") as csv_file: csvwriter = csv.DictWriter(csv_file, header) csvwriter.writeheader() @@ -905,7 +985,7 @@ def _read_csv_lines(path): Opens CSV file `path` and returns list of rows. Pass output of this function to `csv.DictReader` for reading data. """ - csv_file = open(path, 'r') + csv_file = open(path, "r") csv_lines_raw = csv_file.readlines() csv_lines_clean = [line for line in csv_lines_raw if len(line.strip()) > 0] return csv_lines_clean @@ -917,15 +997,13 @@ def _clean_dict(row): """ row_cleaned = {} for key, val in row.items(): - if val is None or val == '': + if val is None or val == "": row_cleaned[key] = None else: row_cleaned[key] = val return row_cleaned - - class ExcelMetadataProvider(MetadataProvider): # LIBRARIES COULD USE # https://github.com/jmcnamara/XlsxWriter/blob/95334f999d3a5fb58d8da3197260e920be357638/dev/docs/source/alternatives.rst diff --git a/ricecooker/utils/paths.py b/ricecooker/utils/paths.py index c522b109..002abf0f 100644 --- a/ricecooker/utils/paths.py +++ b/ricecooker/utils/paths.py @@ -15,7 +15,7 @@ def file_exists(filepath): def get_name_from_url(url): """ - get the filename from a url + get the filename from a url url = http://abc.com/xyz.txt get_name_from_url(url) -> xyz.txt """ @@ -37,7 +37,7 @@ def get_name_from_url(url): def get_name_from_url_no_ext(url): """ - get the filename without the extension name from a url + get the filename without the extension name from a url url = http://abc.com/xyz.txt get_name_from_url(url) -> xyz """ diff --git a/ricecooker/utils/pdf.py b/ricecooker/utils/pdf.py index 0fc58daf..9b3562e1 100644 --- a/ricecooker/utils/pdf.py +++ b/ricecooker/utils/pdf.py @@ -1,8 +1,11 @@ import os -from PyPDF2 import PdfFileWriter, PdfFileReader -from PyPDF2.generic import Destination, NullObject +from PyPDF2 import PdfFileReader +from PyPDF2 import PdfFileWriter +from PyPDF2.generic import Destination +from PyPDF2.generic import NullObject from PyPDF2.utils import PdfReadError + from ricecooker.utils.downloader import read @@ -13,6 +16,7 @@ def __init__(self, title, page, typ, *args): except PdfReadError: pass + class CustomPDFReader(PdfFileReader): def _buildDestination(self, title, array): page, typ = array[0:2] @@ -24,7 +28,8 @@ class PDFParser(object): """ Helper class 
for extracting table of contents and splitting PDFs into chapters. """ - path = None # Local path to source PDF document that will be processed + + path = None # Local path to source PDF document that will be processed def __init__(self, source_path, directory="downloads"): self.directory = directory @@ -58,20 +63,19 @@ def open(self, update=False): with open(self.path, "wb") as fobj: fobj.write(read(self.source_path)) - self.file = open(self.path, 'rb') + self.file = open(self.path, "rb") self.pdf = CustomPDFReader(self.file) def close(self): """ Close main pdf file when done. """ - self.file.close() # Make sure zipfile closes no matter what + self.file.close() # Make sure zipfile closes no matter what def check_path(self): if not self.path: raise ValueError("self.path not found; call `open` first") - def get_toc(self, subchapters=False): """ Returns table-of-contents information extracted from the PDF doc. @@ -96,10 +100,12 @@ def get_toc(self, subchapters=False): for dest in self.pdf.getOutlines(): # Process chapters - if isinstance(dest, CustomDestination) and not isinstance(dest['/Page'], NullObject): + if isinstance(dest, CustomDestination) and not isinstance( + dest["/Page"], NullObject + ): page_num = self.pdf.getDestinationPageNumber(dest) chapter_pagerange = { - "title": dest['/Title'].replace('\xa0', ' '), + "title": dest["/Title"].replace("\xa0", " "), "page_start": page_num if index != 0 else 0, "page_end": self.pdf.numPages, } @@ -122,37 +128,43 @@ def get_toc(self, subchapters=False): parent = chapters[index - 1] subindex = 0 for subdest in dest: - if isinstance(subdest, CustomDestination) and not isinstance(subdest['/Page'], NullObject): + if isinstance(subdest, CustomDestination) and not isinstance( + subdest["/Page"], NullObject + ): subpage_num = self.pdf.getDestinationPageNumber(subdest) - parent['children'].append({ - "title": subdest['/Title'].replace('\xa0', ' '), - "page_start": subpage_num, - "page_end": self.pdf.numPages - }) + parent["children"].append( + { + "title": subdest["/Title"].replace("\xa0", " "), + "page_start": subpage_num, + "page_end": self.pdf.numPages, + } + ) if subindex > 0: - parent['children'][subindex - 1]["page_end"] = subpage_num - subindex +=1 + parent["children"][subindex - 1]["page_end"] = subpage_num + subindex += 1 return chapters - - def write_pagerange(self, pagerange, prefix=''): + def write_pagerange(self, pagerange, prefix=""): """ Save the subset of pages specified in `pagerange` (dict) as separate PDF. e.g. 
pagerange = {'title':'First chapter', 'page_start':0, 'page_end':5} """ writer = PdfFileWriter() - slug = "".join([c for c in pagerange['title'].replace(" ", "-") if c.isalnum() or c == "-"]) - write_to_path = os.path.sep.join([self.directory, "{}{}.pdf".format(prefix, slug)]) - for page in range(pagerange['page_start'], pagerange['page_end']): + slug = "".join( + [c for c in pagerange["title"].replace(" ", "-") if c.isalnum() or c == "-"] + ) + write_to_path = os.path.sep.join( + [self.directory, "{}{}.pdf".format(prefix, slug)] + ) + for page in range(pagerange["page_start"], pagerange["page_end"]): writer.addPage(self.pdf.getPage(page)) - writer.removeLinks() # must be done every page - with open(write_to_path, 'wb') as outfile: + writer.removeLinks() # must be done every page + with open(write_to_path, "wb") as outfile: writer.write(outfile) return write_to_path - - def split_chapters(self, jsondata=None, prefix=''): + def split_chapters(self, jsondata=None, prefix=""): """ Split the PDF doc into individual chapters based on the page-range info, storing individual split PDFs in the output folder `self.directory`. @@ -164,12 +176,11 @@ def split_chapters(self, jsondata=None, prefix=''): toc = jsondata or self.get_toc() chapters = [] for index, chpagerange in enumerate(toc): - newprefix = prefix + str(index) + '-' + newprefix = prefix + str(index) + "-" write_to_path = self.write_pagerange(chpagerange, prefix=newprefix) - chapters.append({"title": chpagerange['title'], "path": write_to_path}) + chapters.append({"title": chpagerange["title"], "path": write_to_path}) return chapters - def split_subchapters(self, jsondata=None): """ Transform a PDF doc into tree of chapters (topics) and subchapters (docs) @@ -184,31 +195,37 @@ def split_subchapters(self, jsondata=None): for index, chpagerange in enumerate(toc): # chapter prefix of the form 1-, 2-, 3-,... 
to avoid name conflicts - chprefix = str(index) + '-' + chprefix = str(index) + "-" # Case A: chapter with no subchapters - if 'children' not in chpagerange or not chpagerange['children']: + if "children" not in chpagerange or not chpagerange["children"]: write_to_path = self.write_pagerange(chpagerange, prefix=chprefix) - chapters.append({"title": chpagerange['title'], "path": write_to_path}) + chapters.append({"title": chpagerange["title"], "path": write_to_path}) # Case B: chapter with subchapters - elif 'children' in chpagerange: - chapter_topic = { 'title': chpagerange['title'], 'children': [] } - subchpageranges = chpagerange['children'] + elif "children" in chpagerange: + chapter_topic = {"title": chpagerange["title"], "children": []} + subchpageranges = chpagerange["children"] first_subchapter = subchpageranges[0] # Handle case when chapter has "intro pages" before first subchapter - if first_subchapter['page_start'] > chpagerange['page_start']: + if first_subchapter["page_start"] > chpagerange["page_start"]: chintro_pagerange = { - 'title': chpagerange['title'], - 'page_start': chpagerange['page_start'], - 'page_end': first_subchapter['page_start'] + "title": chpagerange["title"], + "page_start": chpagerange["page_start"], + "page_end": first_subchapter["page_start"], } - write_to_path = self.write_pagerange(chintro_pagerange, prefix=chprefix) - chapter_topic['children'].append({"title": chpagerange['title'], "path": write_to_path}) + write_to_path = self.write_pagerange( + chintro_pagerange, prefix=chprefix + ) + chapter_topic["children"].append( + {"title": chpagerange["title"], "path": write_to_path} + ) # Handle all subchapters - subchapter_nodes = self.split_chapters(jsondata=subchpageranges, prefix=chprefix) - chapter_topic['children'].extend(subchapter_nodes) + subchapter_nodes = self.split_chapters( + jsondata=subchpageranges, prefix=chprefix + ) + chapter_topic["children"].extend(subchapter_nodes) chapters.append(chapter_topic) return chapters diff --git a/ricecooker/utils/proxy.py b/ricecooker/utils/proxy.py index 5741bad1..1345eb84 100644 --- a/ricecooker/utils/proxy.py +++ b/ricecooker/utils/proxy.py @@ -7,40 +7,41 @@ import os import random import re -import requests import time +import requests -PROXY_LIST = [] # Current list of proxy servers to choose from - -RECENT_PROXIES = [] # Recently used proxies (to avoid using too often) -RECENT_MAX = 3 # Rotate between at least 3 proxy servers +PROXY_LIST = [] # Current list of proxy servers to choose from -MAYBE_BROKEN_PROXIES = {} # {proxy: error_list} to keep track of proxy errors -ERROR_FORGET_TIME = 10 # Ignore proxy errors that are older than 10 mins -ERROR_THRESHOLD = 3 # Add to broken list if encounter 3 errs in 10 mins +RECENT_PROXIES = [] # Recently used proxies (to avoid using too often) +RECENT_MAX = 3 # Rotate between at least 3 proxy servers -BROKEN_PROXIES = [] # Known-bad proxies (we want to avoid choosing these) -BROKEN_PROXIES_CACHE_FILENAME = 'broken_proxies.list' -BROKEN_CACHE_EXPIRE_MINS = 2*24*60 # Ignore broken proxy cache older than 2 days +MAYBE_BROKEN_PROXIES = {} # {proxy: error_list} to keep track of proxy errors +ERROR_FORGET_TIME = 10 # Ignore proxy errors that are older than 10 mins +ERROR_THRESHOLD = 3 # Add to broken list if encounter 3 errs in 10 mins +BROKEN_PROXIES = [] # Known-bad proxies (we want to avoid choosing these) +BROKEN_PROXIES_CACHE_FILENAME = "broken_proxies.list" +BROKEN_CACHE_EXPIRE_MINS = 2 * 24 * 60 # Ignore broken proxy cache older than 2 days # LOADERS
################################################################################ + def load_env_proxies(): """ Load data from the ENV variable PROXY_LIST (a ;-sparated list of proxies). """ - proxy_list_env_var = os.getenv('PROXY_LIST', None) - proxy_list_env_var = proxy_list_env_var.strip(';').strip() + proxy_list_env_var = os.getenv("PROXY_LIST", None) + proxy_list_env_var = proxy_list_env_var.strip(";").strip() if proxy_list_env_var: - return [proxy.strip() for proxy in proxy_list_env_var.split(';')] + return [proxy.strip() for proxy in proxy_list_env_var.split(";")] else: return [] + def load_broken_proxies_cache(): """ Load data from 'broken_proxies.list' if the file not too old. @@ -48,15 +49,15 @@ def load_broken_proxies_cache(): if not os.path.exists(BROKEN_PROXIES_CACHE_FILENAME): return [] mtime = os.path.getmtime(BROKEN_PROXIES_CACHE_FILENAME) - if (time.time() - mtime) > 60*BROKEN_CACHE_EXPIRE_MINS: + if (time.time() - mtime) > 60 * BROKEN_CACHE_EXPIRE_MINS: os.remove(BROKEN_PROXIES_CACHE_FILENAME) return [] broken_proxies = [] - with open(BROKEN_PROXIES_CACHE_FILENAME, 'r') as bpl_file: + with open(BROKEN_PROXIES_CACHE_FILENAME, "r") as bpl_file: for line in bpl_file.readlines(): line = line.strip() - if line and not line.startswith('#'): - broken_proxy = line.split('#')[0].strip() + if line and not line.startswith("#"): + broken_proxy = line.split("#")[0].strip() broken_proxies.append(broken_proxy) return broken_proxies @@ -66,18 +67,18 @@ def get_proxyscape_proxies(): Loads a list of `{ip_address}:{port}` for public proxy servers. """ PROXY_TIMOUT_LIMIT = "1000" - url = 'https://api.proxyscrape.com/?request=getproxies' - url += '&proxytype=http&country=all&ssl=yes&anonymity=all' - url += '&timeout=' + PROXY_TIMOUT_LIMIT + url = "https://api.proxyscrape.com/?request=getproxies" + url += "&proxytype=http&country=all&ssl=yes&anonymity=all" + url += "&timeout=" + PROXY_TIMOUT_LIMIT r = requests.get(url) - return r.text.split('\r\n') + return r.text.split("\r\n") def get_sslproxies_proxies(): - r = requests.get('https://sslproxies.org') + r = requests.get("https://sslproxies.org") matches = re.findall(r"\d+\.\d+\.\d+\.\d+\d+", r.text) - revised = [m.replace('', '') for m in matches] - proxies = [s.replace('', ':')[:-1] for s in revised] + revised = [m.replace("", "") for m in matches] + proxies = [s.replace("", ":")[:-1] for s in revised] return proxies @@ -90,8 +91,8 @@ def get_proxies(refresh=False): if len(PROXY_LIST) == 0 or refresh: # This is either the first run or force-refresh of the list is requested - if os.getenv('PROXY_LIST', None): - proxy_list = load_env_proxies() # (re)load ;-spearated list from ENV + if os.getenv("PROXY_LIST", None): + proxy_list = load_env_proxies() # (re)load ;-spearated list from ENV else: proxy_list = get_proxyscape_proxies() broken_proxy_list = load_broken_proxies_cache() @@ -102,10 +103,10 @@ def get_proxies(refresh=False): return PROXY_LIST - # MAIN ################################################################################ + def choose_proxy(): """ Main function called externally to get a random proxy from the PROXY_LIST. 
@@ -142,10 +143,10 @@ def choose_proxy(): return proxy - # ERROR LOGIC ################################################################################ + def record_error_for_proxy(proxy, exception=None): """ Record a problem with the proxy server `proxy`, optionally passing in the @@ -162,27 +163,27 @@ def record_error_for_proxy(proxy, exception=None): proxy_errors = MAYBE_BROKEN_PROXIES[proxy] recent_proxy_errors = [] for proxy_error in proxy_errors: - if (time.time() - proxy_error['timestamp']) < ERROR_FORGET_TIME*60: + if (time.time() - proxy_error["timestamp"]) < ERROR_FORGET_TIME * 60: recent_proxy_errors.append(proxy_error) recent_proxy_errors.append(error_dict) MAYBE_BROKEN_PROXIES[proxy] = recent_proxy_errors if len(recent_proxy_errors) >= ERROR_THRESHOLD: - reason = str(exception).split('\n')[0] if exception else None + reason = str(exception).split("\n")[0] if exception else None add_to_broken_proxy_list(proxy, reason=reason) else: MAYBE_BROKEN_PROXIES[proxy] = [error_dict] -def add_to_broken_proxy_list(proxy, reason=''): +def add_to_broken_proxy_list(proxy, reason=""): global BROKEN_PROXIES if not proxy in BROKEN_PROXIES: BROKEN_PROXIES.append(proxy) - with open(BROKEN_PROXIES_CACHE_FILENAME, 'a') as bpl_file: + with open(BROKEN_PROXIES_CACHE_FILENAME, "a") as bpl_file: line = proxy if reason: - line += ' # ' + str(reason) - bpl_file.write(line + '\n') + line += " # " + str(reason) + bpl_file.write(line + "\n") if proxy in PROXY_LIST: PROXY_LIST.remove(proxy) diff --git a/ricecooker/utils/subtitles.py b/ricecooker/utils/subtitles.py index 1c03339c..4b1442ee 100644 --- a/ricecooker/utils/subtitles.py +++ b/ricecooker/utils/subtitles.py @@ -1,9 +1,16 @@ import codecs -from pycaption import CaptionSet, WebVTTWriter -from pycaption import WebVTTReader, SRTReader, SAMIReader, SCCReader, DFXPReader -from pycaption import CaptionReadError, CaptionReadNoCaptions -from pycaption.base import DEFAULT_LANGUAGE_CODE + from le_utils.constants import file_formats +from pycaption import CaptionReadError +from pycaption import CaptionReadNoCaptions +from pycaption import CaptionSet +from pycaption import DFXPReader +from pycaption import SAMIReader +from pycaption import SCCReader +from pycaption import SRTReader +from pycaption import WebVTTReader +from pycaption import WebVTTWriter +from pycaption.base import DEFAULT_LANGUAGE_CODE LANGUAGE_CODE_UNKNOWN = DEFAULT_LANGUAGE_CODE @@ -13,6 +20,7 @@ class InvalidSubtitleFormatError(TypeError): """ Custom error indicating a format that is invalid """ + pass @@ -20,6 +28,7 @@ class InvalidSubtitleLanguageError(ValueError): """ Custom error indicating that the provided language isn't present in a captions file """ + pass @@ -28,6 +37,7 @@ class SubtitleReader: A wrapper class for the pycaption readers since the interface differs between all. 
This will call read with `LANGUAGE_CODE_UNKNOWN` if `requires_language` is `True` """ + def __init__(self, reader, requires_language=False): """ :param reader: A pycaption reader @@ -61,9 +71,9 @@ def read(self, caption_str): return self.reader.read(caption_str) except CaptionReadNoCaptions: - raise InvalidSubtitleFormatError('Caption file has no captions') + raise InvalidSubtitleFormatError("Caption file has no captions") except (CaptionReadError, UnicodeDecodeError) as e: - raise InvalidSubtitleFormatError('Caption file is invalid: {}'.format(e)) + raise InvalidSubtitleFormatError("Caption file is invalid: {}".format(e)) # allow other errors to be passed through @@ -71,6 +81,7 @@ class SubtitleConverter: """ This class converts subtitle files to the preferred VTT format """ + def __init__(self, readers, caption_str): """ :param readers: An array of `SubtitleReader` instances @@ -99,10 +110,12 @@ def get_caption_set(self): break else: self.caption_set = None - raise InvalidSubtitleFormatError('Subtitle file is unsupported or unreadable') + raise InvalidSubtitleFormatError( + "Subtitle file is unsupported or unreadable" + ) if self.caption_set.is_empty(): - raise InvalidSubtitleLanguageError('Captions set is invalid') + raise InvalidSubtitleLanguageError("Captions set is invalid") return self.caption_set def get_language_codes(self): @@ -141,7 +154,10 @@ def replace_unknown_language(self, lang_code): # Replace caption_set with new version, having replaced unknown language self.caption_set = CaptionSet( - captions, styles=dict(caption_set.get_styles()), layout_info=caption_set.layout_info) + captions, + styles=dict(caption_set.get_styles()), + layout_info=caption_set.layout_info, + ) def write(self, out_filename, lang_code): """ @@ -151,7 +167,7 @@ def write(self, out_filename, lang_code): :param out_filename: A string path to put the converted captions contents :param lang_code: A string of the language code to write """ - with codecs.open(out_filename, 'w', encoding='utf-8') as converted_file: + with codecs.open(out_filename, "w", encoding="utf-8") as converted_file: converted_file.write(self.convert(lang_code)) def convert(self, lang_code): @@ -168,12 +184,14 @@ def convert(self, lang_code): if not captions: raise InvalidSubtitleLanguageError( - "Language '{}' is not present in caption set".format(lang_code)) + "Language '{}' is not present in caption set".format(lang_code) + ) styles = caption_set.get_styles() layout_info = caption_set.get_layout_info(lang_code) lang_caption_set = CaptionSet( - {lang_code: captions}, styles=dict(styles), layout_info=layout_info) + {lang_code: captions}, styles=dict(styles), layout_info=layout_info + ) return self.writer.write(lang_caption_set) @@ -181,6 +199,7 @@ def convert(self, lang_code): # FACTORY FUNCTIONS # ##################### + def build_dfxp_reader(): return SubtitleReader(DFXPReader()) @@ -213,7 +232,7 @@ def build_vtt_reader(): def build_subtitle_reader(reader_format): if reader_format not in BUILD_READER_MAP: - raise InvalidSubtitleFormatError('Unsupported') + raise InvalidSubtitleFormatError("Unsupported") return BUILD_READER_MAP[reader_format]() @@ -256,9 +275,7 @@ def build_subtitle_converter_from_file(captions_filename, in_format=None): :return: A SubtitleConverter :rtype: SubtitleConverter """ - with codecs.open(captions_filename, encoding='utf-8') as captions_file: + with codecs.open(captions_filename, encoding="utf-8") as captions_file: captions_str = captions_file.read() return build_subtitle_converter(captions_str, in_format) - - 
diff --git a/ricecooker/utils/thumbscropping.py b/ricecooker/utils/thumbscropping.py index 7ce46462..bdf9e73e 100644 --- a/ricecooker/utils/thumbscropping.py +++ b/ricecooker/utils/thumbscropping.py @@ -1,21 +1,22 @@ import math import re -from PIL import Image import sys import types +from PIL import Image + # Useful for very coarse version differentiation. PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] == 3 if PY3: - string_types = str, - integer_types = int, - class_types = type, + string_types = (str,) + integer_types = (int,) + class_types = (type,) text_type = str binary_type = bytes else: - string_types = basestring, + string_types = (basestring,) integer_types = (int, long) class_types = (type, types.ClassType) text_type = unicode @@ -59,7 +60,6 @@ def image_entropy(im): return -sum([p * math.log(p, 2) for p in hist if p != 0]) - def _compare_entropy(start_slice, end_slice, slice, difference): """ Calculate the entropy of two slices (from the start and end of an axis), @@ -80,8 +80,9 @@ def _compare_entropy(start_slice, end_slice, slice, difference): return slice, 0 - -def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, **kwargs): +def scale_and_crop( + im, size, crop=False, upscale=False, zoom=None, target=None, **kwargs +): """ Handle scaling and cropping the source image. Images can be scaled / cropped against a single dimension by using zero @@ -148,9 +149,10 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, if scale < 1.0 or (scale > 1.0 and upscale): # Resize the image to the target size boundary. Round the scaled # boundary sizes to avoid floating point errors. - im = im.resize((int(round(source_x * scale)), - int(round(source_y * scale))), - resample=Image.ANTIALIAS) + im = im.resize( + (int(round(source_x * scale)), int(round(source_y * scale))), + resample=Image.ANTIALIAS, + ) if crop: # Use integer values now. @@ -158,9 +160,9 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, # Difference between new image size and requested size. diff_x = int(source_x - min(source_x, target_x)) diff_y = int(source_y - min(source_y, target_y)) - if crop != 'scale' and (diff_x or diff_y): + if crop != "scale" and (diff_x or diff_y): if isinstance(target, string_types): - target = re.match(r'(\d+)?,(\d+)?$', target) + target = re.match(r"(\d+)?,(\d+)?$", target) if target: target = target.groups() if target: @@ -178,8 +180,9 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, box.append(int(min(source_x, box[0] + target_x))) box.append(int(min(source_y, box[1] + target_y))) # See if an edge cropping argument was provided. - edge_crop = (isinstance(crop, string_types) and - re.match(r'(?:(-?)(\d+))?,(?:(-?)(\d+))?$', crop)) + edge_crop = isinstance(crop, string_types) and re.match( + r"(?:(-?)(\d+))?,(?:(-?)(\d+))?$", crop + ) if edge_crop and filter(None, edge_crop.groups()): x_right, x_crop, y_bottom, y_crop = edge_crop.groups() if x_crop: @@ -199,7 +202,7 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, box[1] = offset box[3] = source_y - (diff_y - offset) # See if the image should be "smart cropped". 
- elif crop == 'smart': + elif crop == "smart": left = top = 0 right, bottom = source_x, source_y while diff_x: diff --git a/ricecooker/utils/tokens.py b/ricecooker/utils/tokens.py index e57a1646..d2623ac4 100644 --- a/ricecooker/utils/tokens.py +++ b/ricecooker/utils/tokens.py @@ -1,9 +1,8 @@ - import json import os import sys -try: # to support Python 2.x. +try: # to support Python 2.x. input = raw_input except NameError: pass @@ -22,6 +21,7 @@ def get_env(envvar): else: return os.environ[envvar] + def get_content_curation_token(args_token): """ Get the token through one of four possible ways. Input `args_token` can be @@ -31,18 +31,19 @@ def get_content_curation_token(args_token): 3a. if environment variable STUDIO_TOKEN exists, we'll use that 3b. else we prompt the user interactively """ - if args_token != "#": # retrieval methods 1, 2 + if args_token != "#": # retrieval methods 1, 2 if os.path.isfile(args_token): - with open(args_token, 'r') as fobj: + with open(args_token, "r") as fobj: return fobj.read().strip() else: return args_token - else: # retrieval strategies 3 - token = get_env('STUDIO_TOKEN') or get_env('CONTENT_CURATION_TOKEN') + else: # retrieval strategies 3 + token = get_env("STUDIO_TOKEN") or get_env("CONTENT_CURATION_TOKEN") if token is not None: - return token # 3a + return token # 3a else: - return prompt_token(config.DOMAIN) # 3b + return prompt_token(config.DOMAIN) # 3b + def prompt_token(domain): """ @@ -51,9 +52,10 @@ def prompt_token(domain): Returns: token """ token = input("\nEnter content curation server token ('q' to quit): ").lower() - if token == 'q': + if token == "q": sys.exit() else: return token.strip() + # SUSHI_BAR_TOKEN = get_env('SUSHI_BAR_TOKEN') # TODO in near future diff --git a/ricecooker/utils/utils.py b/ricecooker/utils/utils.py index d8c17c16..f4d5ee8f 100644 --- a/ricecooker/utils/utils.py +++ b/ricecooker/utils/utils.py @@ -17,4 +17,8 @@ def make_dir_if_needed(path): class VideoURLFormatError(Exception): def __init__(self, url, expected_format): - self.message = "The video at {} does not appear to be a proper {} video URL.".format(url, expected_format) \ No newline at end of file + self.message = ( + "The video at {} does not appear to be a proper {} video URL.".format( + url, expected_format + ) + ) diff --git a/ricecooker/utils/videos.py b/ricecooker/utils/videos.py index 0c43c6fc..8bb9d893 100644 --- a/ricecooker/utils/videos.py +++ b/ricecooker/utils/videos.py @@ -1,6 +1,6 @@ +import logging import re import subprocess -import logging from le_utils.constants import format_presets @@ -9,6 +9,7 @@ LOGGER = logging.getLogger("VideoResource") LOGGER.setLevel(logging.DEBUG) + def guess_video_preset_by_resolution(videopath): """ Run `ffprobe` to find resolution classify as high resolution (video height >= 720), @@ -17,19 +18,31 @@ def guess_video_preset_by_resolution(videopath): """ try: LOGGER.debug("Entering 'guess_video_preset_by_resolution' method") - result = subprocess.check_output(['ffprobe', '-v', 'error', '-print_format', 'json', '-show_entries', - 'stream=width,height', '-of', 'default=noprint_wrappers=1', str(videopath)]) + result = subprocess.check_output( + [ + "ffprobe", + "-v", + "error", + "-print_format", + "json", + "-show_entries", + "stream=width,height", + "-of", + "default=noprint_wrappers=1", + str(videopath), + ] + ) LOGGER.debug("ffprobe stream result = {}".format(result)) - pattern = re.compile('width=([0-9]*)[^height]+height=([0-9]*)') + pattern = re.compile("width=([0-9]*)[^height]+height=([0-9]*)") match = 
pattern.search(str(result)) if match is None: return format_presets.VIDEO_LOW_RES width, height = int(match.group(1)), int(match.group(2)) if height >= 720: - LOGGER.info('Video preset from {} = high resolution'.format(videopath)) + LOGGER.info("Video preset from {} = high resolution".format(videopath)) return format_presets.VIDEO_HIGH_RES else: - LOGGER.info('Video preset from {} = low resolution'.format(videopath)) + LOGGER.info("Video preset from {} = low resolution".format(videopath)) return format_presets.VIDEO_LOW_RES except Exception as e: LOGGER.warning(e) @@ -42,14 +55,44 @@ def extract_thumbnail_from_video(fpath_in, fpath_out, overwrite=False): The thumbnail image will be written in the file object given in `fobj_out`. """ try: - result = subprocess.check_output(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', - 'default=noprint_wrappers=1:nokey=1', "-loglevel", "panic", str(fpath_in)]) + result = subprocess.check_output( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + "-loglevel", + "panic", + str(fpath_in), + ] + ) midpoint = float(re.search("\d+\.\d+", str(result)).group()) / 2 # scale parameters are from https://trac.ffmpeg.org/wiki/Scaling scale = "scale=400:225:force_original_aspect_ratio=decrease,pad=400:225:(ow-iw)/2:(oh-ih)/2" - command = ['ffmpeg',"-y" if overwrite else "-n", '-i', str(fpath_in), "-vf", scale, "-vcodec", "png", "-nostats", - '-ss', str(midpoint), '-vframes', '1', '-q:v', '2', "-loglevel", "panic", str(fpath_out)] + command = [ + "ffmpeg", + "-y" if overwrite else "-n", + "-i", + str(fpath_in), + "-vf", + scale, + "-vcodec", + "png", + "-nostats", + "-ss", + str(midpoint), + "-vframes", + "1", + "-q:v", + "2", + "-loglevel", + "panic", + str(fpath_out), + ] subprocess.check_output(command, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: raise ThumbnailGenerationError("{}: {}".format(e, e.output)) @@ -59,6 +102,7 @@ class VideoCompressionError(Exception): """ Custom error returned when `ffmpeg` compression exits with a non-zero status. 
""" + pass @@ -74,20 +118,47 @@ def compress_video(source_file_path, target_file, overwrite=False, **kwargs): # The output width and height for ffmpeg scale param must be divisible by 2 # using value -2 to get robust behaviour: maintains the aspect ratio and also # ensure the calculated dimension is divisible by 2 - if 'max_width' in kwargs: - scale = "'w=trunc(min(iw,{max_width})/2)*2:h=-2'".format(max_width=kwargs['max_width']) - elif 'max_height' in kwargs: - scale = "'w=-2:h=trunc(min(ih,{max_height})/2)*2'".format(max_height=kwargs['max_height']) + if "max_width" in kwargs: + scale = "'w=trunc(min(iw,{max_width})/2)*2:h=-2'".format( + max_width=kwargs["max_width"] + ) + elif "max_height" in kwargs: + scale = "'w=-2:h=trunc(min(ih,{max_height})/2)*2'".format( + max_height=kwargs["max_height"] + ) else: scale = "'w=-2:h=trunc(min(ih,480)/2)*2'" # default to max-height 480px # set constant rate factor, see https://trac.ffmpeg.org/wiki/Encode/H.264#crf - crf = kwargs['crf'] if 'crf' in kwargs else 32 + crf = kwargs["crf"] if "crf" in kwargs else 32 # run command - command = ["ffmpeg", "-y" if overwrite else "-n", "-i", source_file_path, "-profile:v", "baseline", - "-level", "3.0", "-b:a", "32k", "-ac", "1", "-vf", "scale={}".format(scale), - "-crf", str(crf), "-preset", "slow", "-v", "error", "-strict", "-2", "-stats", target_file] + command = [ + "ffmpeg", + "-y" if overwrite else "-n", + "-i", + source_file_path, + "-profile:v", + "baseline", + "-level", + "3.0", + "-b:a", + "32k", + "-ac", + "1", + "-vf", + "scale={}".format(scale), + "-crf", + str(crf), + "-preset", + "slow", + "-v", + "error", + "-strict", + "-2", + "-stats", + target_file, + ] try: subprocess.check_output(command, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: diff --git a/ricecooker/utils/web.py b/ricecooker/utils/web.py index e72ee033..095e1f75 100644 --- a/ricecooker/utils/web.py +++ b/ricecooker/utils/web.py @@ -3,7 +3,6 @@ Note that we could not use html for the module name as recent versions of Python include their own html module. """ - import os from bs4 import BeautifulSoup @@ -13,15 +12,16 @@ class HTMLParser: """ HTMLParser contains a set of functions for parsing, scraping, and updating an HTML page. """ + def __init__(self, filename=None, html=None): self.filename = filename self.html = html self.link_tags = { - 'a': 'href', - 'audio': 'src', - 'img': 'src', - 'link': 'href', - 'script': 'src' + "a": "href", + "audio": "src", + "img": "src", + "link": "href", + "script": "src", } def get_links(self): @@ -34,7 +34,7 @@ def get_links(self): if self.html is None: basename = os.path.basename(self.filename) self.html = open(self.filename).read() - soup = BeautifulSoup(self.html, 'html.parser') + soup = BeautifulSoup(self.html, "html.parser") extracted_links = [] for tag_name in self.link_tags: @@ -43,11 +43,15 @@ def get_links(self): link = tag.get(self.link_tags[tag_name]) # don't include links to ourselves or # links # TODO: Should this part be moved to get_local_files instead? - if link and (basename and not link.startswith(basename)) and not link.strip().startswith("#"): - if '?' in link: - link, query = link.split('?') - if '#' in link: - link, marker = link.split('#') + if ( + link + and (basename and not link.startswith(basename)) + and not link.strip().startswith("#") + ): + if "?" 
in link: + link, query = link.split("?") + if "#" in link: + link, marker = link.split("#") extracted_links.append(link) return extracted_links @@ -63,7 +67,7 @@ def get_local_files(self): for link in links: # NOTE: This technically fails to handle file:// URLs, but we're highly unlikely to see # file:// URLs in any distributed package, so this is simpler than parsing out the protocol. - if not '://' in link: + if not "://" in link: local_links.append(link) return local_links @@ -78,7 +82,7 @@ def replace_links(self, links_to_replace): if self.html is None: basename = os.path.basename(self.filename) self.html = open(self.filename).read() - soup = BeautifulSoup(self.html, 'html.parser') + soup = BeautifulSoup(self.html, "html.parser") extracted_links = [] for tag_name in self.link_tags: @@ -88,4 +92,4 @@ def replace_links(self, links_to_replace): if link in links_to_replace: tag[self.link_tags[tag_name]] = links_to_replace[link] - return soup.prettify() \ No newline at end of file + return soup.prettify() diff --git a/ricecooker/utils/youtube.py b/ricecooker/utils/youtube.py index e933fd09..3274ab42 100644 --- a/ricecooker/utils/youtube.py +++ b/ricecooker/utils/youtube.py @@ -1,18 +1,18 @@ -from enum import Enum import copy import json import logging import os -import time import re -import youtube_dl - +import time from datetime import datetime +from enum import Enum + +import youtube_dl from le_utils.constants import languages -from ricecooker.config import LOGGER from . import proxy from . import utils +from ricecooker.config import LOGGER LOGGER = logging.getLogger("YouTubeResource") @@ -20,8 +20,8 @@ NON_NETWORK_ERRORS = [ - youtube_dl.utils.ExtractorError, # private and unlisted videos - youtube_dl.utils.PostProcessingError, # custom postprocessors failures + youtube_dl.utils.ExtractorError, # private and unlisted videos + youtube_dl.utils.PostProcessingError, # custom postprocessors failures ] @@ -42,6 +42,7 @@ class YouTubeResource(object): This class encapsulates functionality for information retrieval and download of YouTube resources. Resources may include videos, playlists and channels. """ + # If extract_info request takes longer than this we treat it as broken proxy EXTRACT_TIME_SLOW_LIMIT = 20 # in seconds @@ -51,22 +52,18 @@ def __init__(self, url, useproxy=True, high_resolution=False, options=None): :param url: URL of a YouTube resource. URL may point to a video, playlist or channel. """ - if not 'youtube.com' in url and not 'youtu.be' in url: - raise utils.VideoURLFormatError(url, 'YouTube') + if not "youtube.com" in url and not "youtu.be" in url: + raise utils.VideoURLFormatError(url, "YouTube") self.url = url self.subtitles = {} self.num_retries = 10 self.sleep_seconds = 0.5 - self.preferred_formats = { - 'video': 'mp4', - 'audio': 'm4a' - } + self.preferred_formats = {"video": "mp4", "audio": "m4a"} self.useproxy = useproxy self.high_resolution = high_resolution self.options = options self.client = None # this will become a YoutubeDL instance on first use - self.info = None # save detailed info_dict returned from extract_info - + self.info = None # save detailed info_dict returned from extract_info def get_resource_info(self, options=None): """ @@ -75,26 +72,26 @@ def get_resource_info(self, options=None): :return: A ricecooker-like dict of info about the channel, playlist or video. 
""" extract_info_options = dict( - verbose = True, # TODO(ivan) change this to quiet = True eventually - no_warnings = True, - no_color = True, + verbose=True, # TODO(ivan) change this to quiet = True eventually + no_warnings=True, + no_color=True, # By default, YouTubeDL will pick what it determines to be the best formats, but for consistency's sake # we want to always get preferred formats (default of mp4 and m4a) when possible. - format = "bestvideo[height<={maxheight}][ext={vext}]+bestaudio[ext={aext}]/best[height<={maxheight}][ext={vext}]".format( + format="bestvideo[height<={maxheight}][ext={vext}]+bestaudio[ext={aext}]/best[height<={maxheight}][ext={vext}]".format( maxheight=720 if self.high_resolution else 480, - vext=self.preferred_formats['video'], - aext=self.preferred_formats['audio'] + vext=self.preferred_formats["video"], + aext=self.preferred_formats["audio"], ), ) for i in range(self.num_retries): if self.useproxy: dl_proxy = proxy.choose_proxy() - extract_info_options['proxy'] = dl_proxy + extract_info_options["proxy"] = dl_proxy if self.options: extract_info_options.update(self.options) # init-time options if options: - extract_info_options.update(options) # additional options + extract_info_options.update(options) # additional options try: LOGGER.debug("YoutubeDL options = {}".format(extract_info_options)) @@ -103,17 +100,22 @@ def get_resource_info(self, options=None): LOGGER.debug("Calling extract_info for URL {}".format(self.url)) start_time = datetime.now() - self.info = self.client.extract_info(self.url, download=False, process=True) + self.info = self.client.extract_info( + self.url, download=False, process=True + ) end_time = datetime.now() # Mark slow proxies as broken extract_time = (end_time - start_time).total_seconds() - LOGGER.debug('extract_time = ' + str(extract_time)) + LOGGER.debug("extract_time = " + str(extract_time)) if self.useproxy and extract_time > self.EXTRACT_TIME_SLOW_LIMIT: - if 'entries' in self.info: + if "entries" in self.info: pass # it's OK for extract_info to be slow for playlists else: - proxy.record_error_for_proxy(dl_proxy, exception='extract_info took ' + extract_time + ' seconds') + proxy.record_error_for_proxy( + dl_proxy, + exception="extract_info took " + extract_time + " seconds", + ) LOGGER.info("Found slow proxy {}".format(dl_proxy)) # Format info JSON into ricecooker-like keys @@ -134,7 +136,6 @@ def get_resource_info(self, options=None): LOGGER.warning("Info extraction failed, retrying...") time.sleep(self.sleep_seconds) - def get_dir_name_from_url(self, url=None): """ Takes a URL and returns a directory name to store files in. @@ -148,7 +149,6 @@ def get_dir_name_from_url(self, url=None): name = name.split("?")[0] return " ".join(name.split("_")).title() - def download(self, base_path=None, useproxy=False, options=None): """ Download the YouTube resource(s) specified in `self.info`. If `self.info` @@ -159,17 +159,17 @@ def download(self, base_path=None, useproxy=False, options=None): download_dir = os.path.join(base_path, self.get_dir_name_from_url()) utils.make_dir_if_needed(download_dir) else: - download_dir = '.' + download_dir = "." if self.client is None or self.info is None: # download should always be called after self.info is available self.get_resource_info() # Set reasonable default download options... 
- self.client.params['outtmpl'] = '{}/%(id)s.%(ext)s'.format(download_dir) - self.client.params['writethumbnail'] = True # TODO(ivan): revisit this - self.client.params['continuedl'] = False # clean start to avoid errors - self.client.params['noprogress'] = True # progressbar doesn't log well + self.client.params["outtmpl"] = "{}/%(id)s.%(ext)s".format(download_dir) + self.client.params["writethumbnail"] = True # TODO(ivan): revisit this + self.client.params["continuedl"] = False # clean start to avoid errors + self.client.params["noprogress"] = True # progressbar doesn't log well if options: # ...but override them based on user choices when specified self.client.params.update(options) @@ -182,16 +182,20 @@ def download(self, base_path=None, useproxy=False, options=None): if useproxy: # If useproxy ovverride specified, choose a new proxy server: dl_proxy = proxy.choose_proxy() - self.client.params['proxy'] = dl_proxy + self.client.params["proxy"] = dl_proxy self.client._setup_opener() # this will re-initialize downloader - elif not useproxy and 'proxy' in self.client.params and self.client.params['proxy']: + elif ( + not useproxy + and "proxy" in self.client.params + and self.client.params["proxy"] + ): # Disable proxy if it was used for the get_resource_info call - self.client.params['proxy'] = None + self.client.params["proxy"] = None self.client._setup_opener() # this will re-initialize downloader try: self.info = self.client.process_ie_result(self.info, download=True) - LOGGER.debug('Finished process_ie_result successfully') + LOGGER.debug("Finished process_ie_result successfully") break except Exception as e: network_related_error = True @@ -209,25 +213,24 @@ def download(self, base_path=None, useproxy=False, options=None): os.remove(download_filename) LOGGER.warning(e) if i < self.num_retries - 1: - LOGGER.warning("Download {} failed, retrying...".format(i+1)) + LOGGER.warning("Download {} failed, retrying...".format(i + 1)) time.sleep(self.sleep_seconds) # Post-process results # TODO(ivan): handle post processing filename when custom `outtmpl` specified in options if self.info: edited_results = self._format_for_ricecooker(self.info) - if 'children' in edited_results: - for child in edited_results['children']: - vfilename = "{}.{}".format(child["id"], child['ext']) - child['filename'] = os.path.join(download_dir, vfilename) + if "children" in edited_results: + for child in edited_results["children"]: + vfilename = "{}.{}".format(child["id"], child["ext"]) + child["filename"] = os.path.join(download_dir, vfilename) else: - vfilename = "{}.{}".format(edited_results["id"], edited_results['ext']) - edited_results['filename'] = os.path.join(download_dir, vfilename) + vfilename = "{}.{}".format(edited_results["id"], edited_results["ext"]) + edited_results["filename"] = os.path.join(download_dir, vfilename) return edited_results else: return None - def get_resource_subtitles(self, options=None): """ Retrieves the subtitles for the video(s) represented by this resource. 
@@ -237,9 +240,9 @@ def get_resource_subtitles(self, options=None): :return: A dictionary object that contains information about video subtitles """ options_for_subtitles = dict( - writesubtitles = True, # extract subtitles info - allsubtitles = True, # get all available languages - writeautomaticsub = False, # do not include auto-generated subs + writesubtitles=True, # extract subtitles info + allsubtitles=True, # get all available languages + writeautomaticsub=False, # do not include auto-generated subs ) if options: options_for_subtitles.update(options) @@ -247,7 +250,6 @@ def get_resource_subtitles(self, options=None): info = self.get_resource_info(options=options_for_subtitles) return info - def _format_for_ricecooker(self, results): """ Internal method for converting YouTube resource info into the format expected by ricecooker. @@ -260,42 +262,41 @@ def _format_for_ricecooker(self, results): # dict mapping of field name and default value when not found. extracted_fields = { - 'id': '', - 'title': '', - 'description': '', - 'ext': 'mp4', - 'thumbnail': '', - 'webpage_url': '', - 'tags': [], - 'subtitles': {}, - 'requested_subtitles': '', - 'artist': '', - 'license': '', - '_type': 'video' + "id": "", + "title": "", + "description": "", + "ext": "mp4", + "thumbnail": "", + "webpage_url": "", + "tags": [], + "subtitles": {}, + "requested_subtitles": "", + "artist": "", + "license": "", + "_type": "video", } for field_name in extracted_fields: info_name = field_name - if info_name == '_type': - info_name = 'kind' - elif info_name == 'webpage_url': - info_name = 'source_url' + if info_name == "_type": + info_name = "kind" + elif info_name == "webpage_url": + info_name = "source_url" if field_name in results: leaf[info_name] = results[field_name] else: leaf[info_name] = extracted_fields[field_name] - if 'entries' in results: - leaf['children'] = [] - for entry in results['entries']: + if "entries" in results: + leaf["children"] = [] + for entry in results["entries"]: if entry is not None: - leaf['children'].append(self._format_for_ricecooker(entry)) + leaf["children"].append(self._format_for_ricecooker(entry)) else: LOGGER.info("Skipping None entry bcs failed extract info") return leaf - def check_for_content_issues(self, filter=False): """ Checks the YouTube resource and looks for any issues that may prevent download or distribution of the material, @@ -309,28 +310,27 @@ def check_for_content_issues(self, filter=False): output_video_info = copy.copy(resource_info) videos_with_warnings = [] if filter: - output_video_info['children'] = [] + output_video_info["children"] = [] - for video in resource_info['children']: + for video in resource_info["children"]: warnings = [] - if not video['license']: - warnings.append('no_license_specified') - elif video['license'].find("Creative Commons") == -1: - warnings.append('closed_license') + if not video["license"]: + warnings.append("no_license_specified") + elif video["license"].find("Creative Commons") == -1: + warnings.append("closed_license") if len(warnings) > 0: - videos_with_warnings.append({'video': video, 'warnings': warnings}) + videos_with_warnings.append({"video": video, "warnings": warnings}) elif filter: - output_video_info['children'].append(video) + output_video_info["children"].append(video) return videos_with_warnings, output_video_info - - # YOUTUBE LANGUAGE CODE HELPERS ################################################################################ + def get_language_with_alpha2_fallback(language_code): """ Lookup language code 
`language_code` (string) in the internal language codes, @@ -356,47 +356,48 @@ def is_youtube_subtitle_file_supported_language(language): """ language_obj = get_language_with_alpha2_fallback(language) if language_obj is None: - print('Found unsupported language code {}'.format(language)) + print("Found unsupported language code {}".format(language)) return False else: return True - # CONSTANTS for YouTube cache ################################################################################ -CHEFDATA_DIR = 'chefdata' -DEFAULT_YOUTUBE_CACHE_DIR = os.path.join(CHEFDATA_DIR, 'youtubecache') +CHEFDATA_DIR = "chefdata" +DEFAULT_YOUTUBE_CACHE_DIR = os.path.join(CHEFDATA_DIR, "youtubecache") # CONSTANTS for YouTube resources ################################################################################ YOUTUBE_VIDEO_REGEX = re.compile( - r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P[A-Za-z0-9\-=_]{11})' + r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P[A-Za-z0-9\-=_]{11})" ) YOUTUBE_PLAYLIST_URL_FORMAT = "https://www.youtube.com/playlist?list={0}" YOUTUBE_VIDEO_URL_FORMAT = "https://www.youtube.com/watch?v={0}" + class YouTubeTypes(Enum): """ Enum containing YouTube resource types """ + YOUTUBE_BASE = "YouTubeBase" YOUTUBE_VIDEO = "YouTubeVideo" YOUTUBE_PLAYLIST = "YouTubePlayList" YOUTUBE_CHANNEL = "YouTubeChannel" -class YouTubeUtils(object): +class YouTubeUtils(object): def __init__(self, id, type=YouTubeTypes.YOUTUBE_BASE): self.id = id self.type = type - self.cache_dir = '' - self.cache_path = '' - self.url = '' + self.cache_dir = "" + self.cache_path = "" + self.url = "" def __str__(self): - return '%s (%s)' % (self.type, self.cachename) + return "%s (%s)" % (self.type, self.cachename) def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None): youtube_info = None @@ -412,7 +413,11 @@ def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None): youtube_resource = YouTubeResource(self.url, useproxy=use_proxy) except youtube_dl.utils.ExtractorError as e: if "unavailable" in str(e): - LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__, self.url) + LOGGER.error( + "==> [%s] Resource unavailable for URL: %s", + self.__str__, + self.url, + ) return None if youtube_resource: @@ -420,24 +425,30 @@ def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None): # Save YouTube info to JSON cache file youtube_info = youtube_resource.get_resource_info(options) if youtube_info: - json.dump(youtube_info, - open(self.cache_path, 'w'), - indent=4, - ensure_ascii=False, - sort_keys=True) + json.dump( + youtube_info, + open(self.cache_path, "w"), + indent=4, + ensure_ascii=False, + sort_keys=True, + ) else: - LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__()) + LOGGER.error( + "==> [%s] Failed to extract YouTube info", self.__str__() + ) except Exception as e: - LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e) + LOGGER.error( + "==> [%s] Failed to get YouTube info: %s", self.__str__(), e + ) return None return youtube_info -class YouTubeVideoUtils(YouTubeUtils): +class YouTubeVideoUtils(YouTubeUtils): def __init_subclass__(cls): return super().__init_subclass__() - def __init__(self, id, alias='', cache_dir=''): + def __init__(self, id, alias="", cache_dir=""): """ Initializes YouTubeVideoUtils object with id :param id: YouTube video ID @@ -453,9 +464,11 @@ def __init__(self, id, alias='', 
cache_dir=''): self.cache_dir = DEFAULT_YOUTUBE_CACHE_DIR else: self.cache_dir = cache_dir - self.cache_path = os.path.join(self.cache_dir, self.cachename + '.json') + self.cache_path = os.path.join(self.cache_dir, self.cachename + ".json") - def get_video_info(self, use_proxy=True, use_cache=True, get_subtitle_languages=False, options=None): + def get_video_info( + self, use_proxy=True, use_cache=True, get_subtitle_languages=False, options=None + ): """ Get YouTube video info by either requesting URL or extracting local cache :param use_cache: Define if allowed to get video info from local JSON cache, default to True @@ -467,18 +480,20 @@ def get_video_info(self, use_proxy=True, use_cache=True, get_subtitle_languages= extract_options = dict() if get_subtitle_languages: options_for_subtitles = dict( - writesubtitles=True, # extract subtitles info - allsubtitles=True, # get all available languages + writesubtitles=True, # extract subtitles info + allsubtitles=True, # get all available languages writeautomaticsub=False, # do not include auto-generated subs ) extract_options.update(options_for_subtitles) if options: extract_options.update(options) - return self._get_youtube_info(use_proxy=use_proxy, use_cache=use_cache, options=extract_options) + return self._get_youtube_info( + use_proxy=use_proxy, use_cache=use_cache, options=extract_options + ) -class YouTubePlaylistUtils(YouTubeUtils): - def __init__(self, id, alias='', cache_dir=''): +class YouTubePlaylistUtils(YouTubeUtils): + def __init__(self, id, alias="", cache_dir=""): """ Initializes YouTubePlaylistUtils object with id :param id: YouTube playlist ID @@ -494,9 +509,11 @@ def __init__(self, id, alias='', cache_dir=''): self.cache_dir = DEFAULT_YOUTUBE_CACHE_DIR else: self.cache_dir = cache_dir - self.cache_path = os.path.join(self.cache_dir, self.cachename + '.json') + self.cache_path = os.path.join(self.cache_dir, self.cachename + ".json") - def get_playlist_info(self, use_proxy=True, use_cache=True, youtube_skip_download=True, options=None): + def get_playlist_info( + self, use_proxy=True, use_cache=True, youtube_skip_download=True, options=None + ): """ Get YouTube playlist info by either requesting URL or extracting local cache :param use_cache: Define if allowed to get playlist info from local JSON cache, default to True @@ -506,9 +523,10 @@ def get_playlist_info(self, use_proxy=True, use_cache=True, youtube_skip_downloa :return: A ricecooker-like info dict info about the playlist or None if extraction fails """ youtube_extract_options = dict( - skip_download=youtube_skip_download, - extract_flat=True + skip_download=youtube_skip_download, extract_flat=True ) if options: youtube_extract_options.update(options) - return self._get_youtube_info(use_proxy=use_proxy, use_cache=use_cache, options=youtube_extract_options) + return self._get_youtube_info( + use_proxy=use_proxy, use_cache=use_cache, options=youtube_extract_options + ) diff --git a/ricecooker/utils/zip.py b/ricecooker/utils/zip.py index b7478ed9..1981852d 100644 --- a/ricecooker/utils/zip.py +++ b/ricecooker/utils/zip.py @@ -39,7 +39,9 @@ def create_predictable_zip(path, entrypoint=None): f.close() for root, directories, filenames in os.walk(path): - paths += [os.path.join(root, filename)[len(path)+1:] for filename in filenames] + paths += [ + os.path.join(root, filename)[len(path) + 1 :] for filename in filenames + ] reader = lambda x: _read_file(os.path.join(path, x)) # otherwise, if it's a zip file, open it up and pull out the list of names elif os.path.isfile(path) 
and os.path.splitext(path)[1] == ".zip": @@ -55,7 +57,9 @@ def create_predictable_zip(path, entrypoint=None): with zipfile.ZipFile(zippath, "w") as outputzip: # loop over the file paths in sorted order, to ensure a predictable zip for filepath in sorted(paths): - write_file_to_zip_with_neutral_metadata(outputzip, filepath, reader(filepath)) + write_file_to_zip_with_neutral_metadata( + outputzip, filepath, reader(filepath) + ) os.fdopen(zippathfd).close() return zippath @@ -74,4 +78,3 @@ def write_file_to_zip_with_neutral_metadata(zfile, filename, content): info.comment = "".encode() info.create_system = 0 zfile.writestr(info, content) - diff --git a/setup.py b/setup.py index ed24e053..a94fd017 100644 --- a/setup.py +++ b/setup.py @@ -6,23 +6,23 @@ import ricecooker -readme = open('README.md').read() +readme = open("README.md").read() -with open('docs/history.rst') as history_file: +with open("docs/history.rst") as history_file: history = history_file.read() requirements = [ "pytest>=3.0.2", "requests>=2.11.1", "le_utils>=0.1.26", - "validators", # TODO: check if this is necessary + "validators", # TODO: check if this is necessary "requests_file", - "beautifulsoup4>=4.6.3,<4.9.0", # pinned to match versions in le-pycaption + "beautifulsoup4>=4.6.3,<4.9.0", # pinned to match versions in le-pycaption "selenium==3.0.1", "youtube-dl>=2020.6.16.1", "html5lib", "cachecontrol==0.12.0", - "lockfile==0.12.2", # TODO: check if this is necessary + "lockfile==0.12.2", # TODO: check if this is necessary "css-html-js-minify==2.2.2", "mock==2.0.0", "pypdf2>=1.26.0", @@ -40,36 +40,36 @@ setup( - name='ricecooker', + name="ricecooker", version=ricecooker.__version__, description="API for adding content to the Kolibri content curation server", - long_description=readme + '\n\n' + history, - long_description_content_type='text/markdown', + long_description=readme + "\n\n" + history, + long_description_content_type="text/markdown", author="Learning Equality", - author_email='dev@learningequality.org', - url='https://github.com/learningequality/ricecooker', + author_email="dev@learningequality.org", + url="https://github.com/learningequality/ricecooker", packages=find_packages(), - package_dir={'ricecooker':'ricecooker'}, - entry_points = { - 'console_scripts': [ - 'corrections = ricecooker.utils.corrections:correctionsmain', - 'jiro = ricecooker.cli:main' + package_dir={"ricecooker": "ricecooker"}, + entry_points={ + "console_scripts": [ + "corrections = ricecooker.utils.corrections:correctionsmain", + "jiro = ricecooker.cli:main", ], }, include_package_data=True, install_requires=requirements, license="MIT license", zip_safe=False, - keywords='ricecooker', + keywords="ricecooker", classifiers=[ - 'Intended Audience :: Developers', - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: MIT License', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Natural Language :: English', - 'Topic :: Education', + "Intended Audience :: Developers", + "Development Status :: 5 - Production/Stable", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Natural Language :: English", + "Topic :: Education", ], - test_suite='tests', + test_suite="tests", ) diff --git a/tests/conftest.py b/tests/conftest.py index 57d5d6e4..8e1f65ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,57 +1,86 @@ 
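As an aside on the `ricecooker/utils/zip.py` change above: a minimal sketch of how
`create_predictable_zip` can be paired with `HTMLZipFile` when packaging HTML5 content
(the `chefdata/webroot` directory is a made-up example, assumed to contain an `index.html`):

```python
from ricecooker.classes.files import HTMLZipFile
from ricecooker.utils.zip import create_predictable_zip

# Zipping the same directory twice yields byte-identical archives (neutral
# metadata, sorted file order), so the resulting file hash stays stable.
zip_path = create_predictable_zip("chefdata/webroot")
html_file = HTMLZipFile(zip_path)
```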
import copy import glob import os -import pytest -import requests import uuid import zipfile -from le_utils.constants import licenses, content_kinds, exercises, roles -from ricecooker.__init__ import __version__ -from ricecooker.classes.files import AudioFile, DocumentFile, EPubFile, HTMLZipFile, ThumbnailFile, SlideImageFile, SubtitleFile, VideoFile -from ricecooker.classes.files import _ExerciseImageFile, _ExerciseBase64ImageFile, _ExerciseGraphieFile -from ricecooker.classes.nodes import AudioNode, ChannelNode, DocumentNode, ExerciseNode, HTML5AppNode, SlideshowNode, TopicNode, VideoNode -from ricecooker.classes.questions import InputQuestion, SingleSelectQuestion +import pytest +import requests +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import licenses +from le_utils.constants import roles +from ricecooker.__init__ import __version__ +from ricecooker.classes.files import _ExerciseBase64ImageFile +from ricecooker.classes.files import _ExerciseGraphieFile +from ricecooker.classes.files import _ExerciseImageFile +from ricecooker.classes.files import AudioFile +from ricecooker.classes.files import DocumentFile +from ricecooker.classes.files import EPubFile +from ricecooker.classes.files import HTMLZipFile +from ricecooker.classes.files import SlideImageFile +from ricecooker.classes.files import SubtitleFile +from ricecooker.classes.files import ThumbnailFile +from ricecooker.classes.files import VideoFile +from ricecooker.classes.nodes import AudioNode +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import DocumentNode +from ricecooker.classes.nodes import ExerciseNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import SlideshowNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.classes.nodes import VideoNode +from ricecooker.classes.questions import InputQuestion +from ricecooker.classes.questions import SingleSelectQuestion # GLOBAL TEST SETUP/TEARDOWN UTILS ################################################################################ + def pytest_sessionfinish(session, exitstatus): """ Cleanup testcontent/generated/ directory after each test run is finished. 
""" generated_path = os.path.join("tests", "testcontent", "generated") - for path in glob.glob(generated_path + os.path.sep + '*'): + for path in glob.glob(generated_path + os.path.sep + "*"): os.remove(path) # CHANNEL FIXTURES ################################################################################ + @pytest.fixture def domain_namespace(): return "testing.learningequality.org" + @pytest.fixture def channel_source_id(): return "channel-id" + @pytest.fixture def channel_domain_namespace(domain_namespace): return uuid.uuid5(uuid.NAMESPACE_DNS, domain_namespace) + @pytest.fixture def channel_node_id(channel_domain_namespace, channel_source_id): return uuid.uuid5(channel_domain_namespace, channel_source_id) + @pytest.fixture def channel_content_id(channel_domain_namespace, channel_node_id): return uuid.uuid5(channel_domain_namespace, channel_node_id.hex) + @pytest.fixture -def channel_data(channel_node_id, channel_content_id, domain_namespace, channel_source_id): +def channel_data( + channel_node_id, channel_content_id, domain_namespace, channel_source_id +): return { "id": channel_node_id.hex, "name": "Channel", @@ -67,34 +96,31 @@ def channel_data(channel_node_id, channel_content_id, domain_namespace, channel_ "extra_fields": "{}", } + @pytest.fixture def channel(domain_namespace, channel_source_id, channel_data): channel = ChannelNode( channel_source_id, domain_namespace, - title=channel_data['name'], - description=channel_data['description'], - tagline=channel_data['tagline'], - language=channel_data['language'] + title=channel_data["name"], + description=channel_data["description"], + tagline=channel_data["tagline"], + language=channel_data["language"], ) return channel + @pytest.fixture def invalid_channel(channel_source_id, domain_namespace): - channel = ChannelNode( - channel_source_id, - domain_namespace, - title='Invalid Channel' - ) + channel = ChannelNode(channel_source_id, domain_namespace, title="Invalid Channel") channel.source_id = None return channel - - # ID, ARGS, AND KWARGS FIXTURE HELPERS ################################################################################ + @pytest.fixture def base_data(channel_domain_namespace, title): """ @@ -108,15 +134,15 @@ def base_data(channel_domain_namespace, title): "description": "Description", "author": "Author", "source_domain": channel_domain_namespace.hex, - "files" : [], + "files": [], "tags": [], "questions": [], "extra_fields": {}, # dict as input kwarg, but json.dumps-ed in to_dict "license": None, "copyright_holder": "", "license_description": None, - "aggregator": "", # New in ricecooker 0.6.20 - "provider": "", # New in ricecooker 0.6.20 + "aggregator": "", # New in ricecooker 0.6.20 + "provider": "", # New in ricecooker 0.6.20 } @@ -135,38 +161,38 @@ def genrate_random_ids(channel_domain_namespace, channel_node_id): return ids_dict - - # TOPIC FIXTURES ################################################################################ + def get_topic_node_args(node_data): """ Returns (source_id, title) from node_data dictionary. """ node_data = copy.deepcopy(node_data) - source_id = node_data.pop('source_id') - title = node_data.pop('title') - license = node_data.pop('license') + source_id = node_data.pop("source_id") + title = node_data.pop("title") + license = node_data.pop("license") return source_id, title + def get_topic_node_kwargs_data(node_data): """ Returns all keywords data other than source_id, title, and license. 
""" node_data = copy.deepcopy(node_data) - del node_data['source_id'] - del node_data['title'] + del node_data["source_id"] + del node_data["title"] # the following attributes will appear in `to_dict` method, but we don't need # to pass them in when creating a TopicNode - del node_data['content_id'] - del node_data['node_id'] - del node_data['kind'] - del node_data['source_domain'] - del node_data['questions'] - del node_data['license'] - del node_data['license_description'] - del node_data['copyright_holder'] + del node_data["content_id"] + del node_data["node_id"] + del node_data["kind"] + del node_data["source_domain"] + del node_data["questions"] + del node_data["license"] + del node_data["license_description"] + del node_data["copyright_holder"] return node_data @@ -180,9 +206,10 @@ def topic_data(base_data, channel_domain_namespace, channel_node_id): topic_data = copy.deepcopy(base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) topic_data.update(ids_dict) - topic_data.update({ "kind": content_kinds.TOPIC }) + topic_data.update({"kind": content_kinds.TOPIC}) return topic_data + @pytest.fixture def topic(channel, title, topic_data): args_data = get_topic_node_args(topic_data) @@ -195,16 +222,21 @@ def topic(channel, title, topic_data): # CONTENT NODE FIXTURES ################################################################################ + @pytest.fixture def contentnode_base_data(base_data): """ Shared data for all ContentNode fixtures. """ data = copy.deepcopy(base_data) - data.update({ "license": licenses.CC_BY, - "copyright_holder": "Copyright Holder", - "license_description": None, - "role": roles.LEARNER}) + data.update( + { + "license": licenses.CC_BY, + "copyright_holder": "Copyright Holder", + "license_description": None, + "role": roles.LEARNER, + } + ) return data @@ -213,9 +245,9 @@ def get_content_node_args(node_data): Returns (source_id, title, license) from node_data dictionary. """ node_data = copy.deepcopy(node_data) - source_id = node_data.pop('source_id') - title = node_data.pop('title') - license = node_data.pop('license') + source_id = node_data.pop("source_id") + title = node_data.pop("title") + license = node_data.pop("license") return source_id, title, license @@ -224,16 +256,16 @@ def get_content_node_kwargs(node_data): Returns all keywords data other than source_id, title, and license. 
""" node_data = copy.deepcopy(node_data) - del node_data['source_id'] - del node_data['title'] - del node_data['license'] + del node_data["source_id"] + del node_data["title"] + del node_data["license"] # below are vars from internal representation - del node_data['content_id'] - del node_data['node_id'] - del node_data['kind'] - del node_data['source_domain'] - del node_data['questions'] - node_data['extra_fields'] = {} + del node_data["content_id"] + del node_data["node_id"] + del node_data["kind"] + del node_data["source_domain"] + del node_data["questions"] + node_data["extra_fields"] = {} return node_data @@ -241,57 +273,63 @@ def get_content_node_kwargs(node_data): def base_file_path(): return "test/file/path" + @pytest.fixture def contentnode_invalid_license(video): video = copy.deepcopy(video) video.license = None return video + @pytest.fixture def contentnode_invalid_files(video): video = copy.deepcopy(video) video.files = [] return video + @pytest.fixture def contentnode_no_source_id(title): - topic = TopicNode('some source id', title) + topic = TopicNode("some source id", title) topic.source_id = None return topic - - - # VIDEO FIXTURES ################################################################################ + @pytest.fixture -def video_file(): # uses same file as test_videos.low_res_video fixture - source_url = "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4" +def video_file(): # uses same file as test_videos.low_res_video fixture + source_url = ( + "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4" + ) local_path = os.path.join("tests", "testcontent", "downloaded", "low_res_video.mp4") download_fixture_file(source_url, local_path) assert os.path.exists(local_path) return VideoFile(local_path) + @pytest.fixture def video_filename(): - return '897d83a2e5389d454d37feb574587516.mp4' + return "897d83a2e5389d454d37feb574587516.mp4" + @pytest.fixture def subtitle_file(): local_path = os.path.join("tests", "testcontent", "generated", "testsubtitles.vtt") if not os.path.exists(local_path): - with open(local_path, 'wb') as subtitlefile: - subtitlefile.write(b'WEBVTT\n') - subtitlefile.write(b'\n') - subtitlefile.write(b'00:01.000 --> 00:04.250\n') - subtitlefile.write(b'Testing subtitles\n') - return SubtitleFile(local_path, language='en') + with open(local_path, "wb") as subtitlefile: + subtitlefile.write(b"WEBVTT\n") + subtitlefile.write(b"\n") + subtitlefile.write(b"00:01.000 --> 00:04.250\n") + subtitlefile.write(b"Testing subtitles\n") + return SubtitleFile(local_path, language="en") + @pytest.fixture def subtitle_filename(): - return '19faefeb0b8b8289923dc0c1c5adb7e5.vtt' + return "19faefeb0b8b8289923dc0c1c5adb7e5.vtt" @pytest.fixture @@ -299,9 +337,10 @@ def video_data(contentnode_base_data, channel_domain_namespace, channel_node_id) video_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) video_data.update(ids_dict) - video_data.update({ "kind": content_kinds.VIDEO }) + video_data.update({"kind": content_kinds.VIDEO}) return video_data + @pytest.fixture def video(video_file, video_data, channel): args_data = get_content_node_args(video_data) @@ -309,26 +348,29 @@ def video(video_file, video_data, channel): video = VideoNode(*args_data, **contentnode_kwargs) video.add_file(video_file) channel.add_child(video) - video_data['files'].append(video_file) # save it so we can compare later + video_data["files"].append(video_file) # save it so we can 
compare later return video + @pytest.fixture def video_invalid_files(video_data, document_file): args_data = get_content_node_args(video_data) contentnode_kwargs = get_content_node_kwargs(video_data) - contentnode_kwargs['files'] = [] # clear files becuse added one above + contentnode_kwargs["files"] = [] # clear files becuse added one above video = VideoNode(*args_data, **contentnode_kwargs) video.add_file(document_file) return video + @pytest.fixture def invalid_video_file(): local_path = os.path.join("tests", "testcontent", "generated", "invalid_video.mp4") if not os.path.exists(local_path): - with open(local_path, 'wb') as f: - f.write(b'this is an invalid video file') + with open(local_path, "wb") as f: + f.write(b"this is an invalid video file") return DocumentFile(local_path) + @pytest.fixture def youtube_video_dict(): """ @@ -336,6 +378,7 @@ def youtube_video_dict(): """ return {"youtube_id": "C0DPdy98e4c"} + @pytest.fixture def youtube_video_with_subs_dict(): """ @@ -343,34 +386,53 @@ def youtube_video_with_subs_dict(): """ return { "youtube_id": "USq6DX7byoY", - "subtitles_langs": ["nl", "en", "en-GB", "fr", "el", "hu", "it", "pt", "ro", "es"] + "subtitles_langs": [ + "nl", + "en", + "en-GB", + "fr", + "el", + "hu", + "it", + "pt", + "ro", + "es", + ], } + # AUDIO FIXTURES ################################################################################ + @pytest.fixture def audio_file(): - source_url = "https://ia800103.us.archive.org/9/items/cd_prince_prince/" \ - "disc1/02.%20Prince%20-%201999%20%28Edit%29_sample.mp3" + source_url = ( + "https://ia800103.us.archive.org/9/items/cd_prince_prince/" + "disc1/02.%20Prince%20-%201999%20%28Edit%29_sample.mp3" + ) local_path = os.path.join("tests", "testcontent", "downloaded", "testaudio.mp3") download_fixture_file(source_url, local_path) assert os.path.exists(local_path) return AudioFile(local_path) + @pytest.fixture def audio_filename(): - return 'c335e8044ecf583c690d5d8c65d68627.mp3' + return "c335e8044ecf583c690d5d8c65d68627.mp3" @pytest.fixture -def audio_data(contentnode_base_data, audio_file, channel_domain_namespace, channel_node_id): +def audio_data( + contentnode_base_data, audio_file, channel_domain_namespace, channel_node_id +): audio_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) audio_data.update(ids_dict) - audio_data.update({ "kind": content_kinds.AUDIO }) + audio_data.update({"kind": content_kinds.AUDIO}) return audio_data + @pytest.fixture def audio(audio_file, audio_data, channel): args_data = get_content_node_args(audio_data) @@ -378,29 +440,33 @@ def audio(audio_file, audio_data, channel): audio = AudioNode(*args_data, **contentnode_kwargs) audio.add_file(audio_file) channel.add_child(audio) - audio_data['files'].append(audio_file) # save it so we can compare later + audio_data["files"].append(audio_file) # save it so we can compare later return audio + @pytest.fixture def audio_invalid_files(audio_data, document_file): args_data = get_content_node_args(audio_data) contentnode_kwargs = get_content_node_kwargs(audio_data) - contentnode_kwargs['files'] = [] # clear files because added one above + contentnode_kwargs["files"] = [] # clear files because added one above audio = AudioNode(*args_data, **contentnode_kwargs) audio.add_file(document_file) return audio + @pytest.fixture def invalid_audio_file(): local_path = os.path.join("tests", "testcontent", "generated", "invalid_audio.mp3") if not os.path.exists(local_path): - with open(local_path, 'wb') 
as f: - f.write(b'invalid MP3') + with open(local_path, "wb") as f: + f.write(b"invalid MP3") return DocumentFile(local_path) + # DOCUMENT FIXTURES ################################################################################ + @pytest.fixture def document_file(): source_url = "https://ia802506.us.archive.org/8/items/generalmanual_000075878/generalmanual_000075878.pdf" @@ -409,18 +475,23 @@ def document_file(): assert os.path.exists(local_path) return DocumentFile(local_path) + @pytest.fixture def document_filename(): - return 'b976c31a7ab68a97f12541d661245238.pdf' + return "b976c31a7ab68a97f12541d661245238.pdf" + @pytest.fixture -def document_data(contentnode_base_data, document_file, channel_domain_namespace, channel_node_id): +def document_data( + contentnode_base_data, document_file, channel_domain_namespace, channel_node_id +): document_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) document_data.update(ids_dict) - document_data.update({ "kind": content_kinds.DOCUMENT }) + document_data.update({"kind": content_kinds.DOCUMENT}) return document_data + @pytest.fixture def document(document_file, document_data, channel): args_data = get_content_node_args(document_data) @@ -428,14 +499,15 @@ def document(document_file, document_data, channel): document = DocumentNode(*args_data, **contentnode_kwargs) document.add_file(document_file) channel.add_child(document) - document_data['files'].append(document_file) # save it so we can compare later + document_data["files"].append(document_file) # save it so we can compare later return document + @pytest.fixture def document_invalid_files(document_data, audio_file): args_data = get_content_node_args(document_data) contentnode_kwargs = get_content_node_kwargs(document_data) - contentnode_kwargs['files'] = [] # clear files becuse added one above + contentnode_kwargs["files"] = [] # clear files becuse added one above document = DocumentNode(*args_data, **contentnode_kwargs) document.add_file(audio_file) return document @@ -447,53 +519,66 @@ def epub_file(): assert os.path.exists(path) return EPubFile(path) + @pytest.fixture def epub_filename(): - return '5f91b55a7648206343b609cae692e08c.epub' + return "5f91b55a7648206343b609cae692e08c.epub" @pytest.fixture def invalid_document_file(): - local_path = os.path.join("tests", "testcontent", "generated", "invalid_document.pdf") + local_path = os.path.join( + "tests", "testcontent", "generated", "invalid_document.pdf" + ) if not os.path.exists(local_path): - with open(local_path, 'wb') as f: - f.write(b'invalid PDF') + with open(local_path, "wb") as f: + f.write(b"invalid PDF") return DocumentFile(local_path) + @pytest.fixture def invalid_epub_file(): - local_path = os.path.join("tests", "testcontent", "generated", "invalid_document.epub") + local_path = os.path.join( + "tests", "testcontent", "generated", "invalid_document.epub" + ) if not os.path.exists(local_path): - with open(local_path, 'wb') as f: - f.write(b'invalid ePub') + with open(local_path, "wb") as f: + f.write(b"invalid ePub") return EPubFile(local_path) # HTML FIXTURES ################################################################################ + @pytest.fixture def html_file(): - source_url = "https://studio.learningequality.org/content/storage/" \ - "e/d/ed494d6547b603b8ff22095cf5f5b624.zip" + source_url = ( + "https://studio.learningequality.org/content/storage/" + "e/d/ed494d6547b603b8ff22095cf5f5b624.zip" + ) local_path = os.path.join("tests", "testcontent", 
"downloaded", "testhtml.zip") download_fixture_file(source_url, local_path) assert os.path.exists(local_path) return HTMLZipFile(local_path) + @pytest.fixture def html_filename(): - return 'ed494d6547b603b8ff22095cf5f5b624.zip' + return "ed494d6547b603b8ff22095cf5f5b624.zip" @pytest.fixture -def html_data(contentnode_base_data, html_file, channel_domain_namespace, channel_node_id): +def html_data( + contentnode_base_data, html_file, channel_domain_namespace, channel_node_id +): html_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) html_data.update(ids_dict) - html_data.update({ "kind": content_kinds.HTML5 }) + html_data.update({"kind": content_kinds.HTML5}) return html_data + @pytest.fixture def html(html_file, html_data, channel): args_data = get_content_node_args(html_data) @@ -501,7 +586,7 @@ def html(html_file, html_data, channel): html = HTML5AppNode(*args_data, **contentnode_kwargs) html.add_file(html_file) channel.add_child(html) - html_data['files'].append(html_file) # save it so we can compare later + html_data["files"].append(html_file) # save it so we can compare later return html @@ -512,7 +597,7 @@ def html_invalid_files(html_data, document_file): """ args_data = get_content_node_args(html_data) contentnode_kwargs = get_content_node_kwargs(html_data) - contentnode_kwargs['files'] = [] # clear files becuse added one above + contentnode_kwargs["files"] = [] # clear files becuse added one above html = HTML5AppNode(*args_data, **contentnode_kwargs) html.add_file(document_file) return html @@ -520,58 +605,75 @@ def html_invalid_files(html_data, document_file): @pytest.fixture def html_invalid_file(): - local_path = os.path.join("tests", "testcontent", "generated", "testinvalidhtml.zip") + local_path = os.path.join( + "tests", "testcontent", "generated", "testinvalidhtml.zip" + ) if not os.path.exists(local_path): - with zipfile.ZipFile(local_path, 'w', zipfile.ZIP_DEFLATED) as archive: - archive.writestr("notindex.html", '
') + with zipfile.ZipFile(local_path, "w", zipfile.ZIP_DEFLATED) as archive: + archive.writestr("notindex.html", "
") return HTMLZipFile(local_path) + @pytest.fixture def html_invalid_zip(html_data, html_invalid_file): args_data = get_content_node_args(html_data) contentnode_kwargs = get_content_node_kwargs(html_data) - contentnode_kwargs['files'] = [] # clear files because added one above + contentnode_kwargs["files"] = [] # clear files because added one above html = HTML5AppNode(*args_data, **contentnode_kwargs) html.add_file(html_invalid_file) return html - # EXERCISE FIXTURES ################################################################################ + @pytest.fixture def exercise_question(): return SingleSelectQuestion("question_1", "Question", "Answer", ["Answer"]) + @pytest.fixture def mastery_model(): - return {'mastery_model': exercises.M_OF_N, 'randomize': True, 'm': 1, 'n': 1} + return {"mastery_model": exercises.M_OF_N, "randomize": True, "m": 1, "n": 1} + @pytest.fixture -def exercise_data(contentnode_base_data, mastery_model, exercise_question, channel_domain_namespace, channel_node_id): +def exercise_data( + contentnode_base_data, + mastery_model, + exercise_question, + channel_domain_namespace, + channel_node_id, +): exercise_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) exercise_data.update(ids_dict) - exercise_data.update({ "kind": content_kinds.EXERCISE, - "questions":[], - "exercise_data": mastery_model}) + exercise_data.update( + { + "kind": content_kinds.EXERCISE, + "questions": [], + "exercise_data": mastery_model, + } + ) return exercise_data + @pytest.fixture def exercise(exercise_question, exercise_data, channel): args_data = get_content_node_args(exercise_data) contentnode_kwargs = get_content_node_kwargs(exercise_data) - del contentnode_kwargs['extra_fields'] - mastery_model_dict = contentnode_kwargs['exercise_data'] + del contentnode_kwargs["extra_fields"] + mastery_model_dict = contentnode_kwargs["exercise_data"] exercise = ExerciseNode(*args_data, **contentnode_kwargs) exercise.add_question(exercise_question) channel.add_child(exercise) - exercise_data['questions'] = [exercise_question] - exercise_data['extra_fields'] = mastery_model_dict - del exercise_data['exercise_data'] + exercise_data["questions"] = [exercise_question] + exercise_data["extra_fields"] = mastery_model_dict + del exercise_data["exercise_data"] return exercise + @pytest.fixture def exercise_invalid_question(exercise): exercise = copy.deepcopy(exercise) @@ -579,122 +681,131 @@ def exercise_invalid_question(exercise): return exercise - # THUMBNAIL FILE FIXTURES ################################################################################ + @pytest.fixture def thumbnail_file(): local_path = os.path.join("tests", "testcontent", "samples", "thumbnail.png") assert os.path.exists(local_path) return ThumbnailFile(local_path) + @pytest.fixture def thumbnail_filename(): - return 'eb79354ddd5774bb3436f9a19c282bff.png' + return "eb79354ddd5774bb3436f9a19c282bff.png" + @pytest.fixture def fake_thumbnail_file(): local_path = os.path.join("tests", "testcontent", "generated", "invalidimage.png") if not os.path.exists(local_path): - with open(local_path, 'wb') as imgfile: - imgfile.write(b'not_a_valid_PNG') + with open(local_path, "wb") as imgfile: + imgfile.write(b"not_a_valid_PNG") return ThumbnailFile(local_path) - # EXERCISE IMAGES FIXTURES ################################################################################ + @pytest.fixture def exercise_image_file(): - return _ExerciseImageFile('tests/testcontent/exercises/no-wifi.png') + 
return _ExerciseImageFile("tests/testcontent/exercises/no-wifi.png") + @pytest.fixture def exercise_image_filename(): - return '599aa896313be22dea6c0257772a464e.png' + return "599aa896313be22dea6c0257772a464e.png" @pytest.fixture def exercise_base64_image_file(): - with open('tests/testcontent/exercises/test_image_base64.data') as datafile: + with open("tests/testcontent/exercises/test_image_base64.data") as datafile: base64_data = datafile.read() return _ExerciseBase64ImageFile(base64_data) + @pytest.fixture def exercise_base64_image_filename(): - return 'cd9635def904486701e7705ef29ece67.png' + return "cd9635def904486701e7705ef29ece67.png" @pytest.fixture def exercise_graphie_file(): - return _ExerciseGraphieFile('tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd') + return _ExerciseGraphieFile( + "tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd" + ) + @pytest.fixture def exercise_graphie_replacement_str(): - return 'eb3f3bf7c317408ee90995b5bcf4f3a59606aedd' + return "eb3f3bf7c317408ee90995b5bcf4f3a59606aedd" + @pytest.fixture def exercise_graphie_filename(): - return 'ea2269bb5cf487f8d883144b9c06fbc7.graphie' - - + return "ea2269bb5cf487f8d883144b9c06fbc7.graphie" # SLIDESHOW IMAGES FIXTURES ################################################################################ + @pytest.fixture def slideshow_files(): fake_files = [] - for i in range(0,10): - filename = 'tests/testcontent/generated/slide' + str(i) + '.jpg' + for i in range(0, 10): + filename = "tests/testcontent/generated/slide" + str(i) + ".jpg" if not os.path.exists(filename): - with open(filename, 'w') as f: - f.write('jpgdatawouldgohere' + str(i)) - fake_files.append( - SlideImageFile(filename, caption='slide ' + str(i)) - ) + with open(filename, "w") as f: + f.write("jpgdatawouldgohere" + str(i)) + fake_files.append(SlideImageFile(filename, caption="slide " + str(i))) return fake_files + @pytest.fixture -def slideshow_data(contentnode_base_data, slideshow_files, channel_domain_namespace, channel_node_id): +def slideshow_data( + contentnode_base_data, slideshow_files, channel_domain_namespace, channel_node_id +): slideshow_data = copy.deepcopy(contentnode_base_data) ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id) slideshow_data.update(ids_dict) - slideshow_data.update({ "kind": content_kinds.SLIDESHOW }) + slideshow_data.update({"kind": content_kinds.SLIDESHOW}) # TODO setup expected extra_fields['slideshow_data'] return slideshow_data + @pytest.fixture def slideshow(slideshow_files, slideshow_data, channel): args_data = get_content_node_args(slideshow_data) contentnode_kwargs = get_content_node_kwargs(slideshow_data) - del contentnode_kwargs['extra_fields'] + del contentnode_kwargs["extra_fields"] slideshow = SlideshowNode(*args_data, **contentnode_kwargs) for slideshow_file in slideshow_files: slideshow.add_file(slideshow_file) channel.add_child(slideshow) - slideshow_data['files'] = slideshow_files # save it so we can compare later + slideshow_data["files"] = slideshow_files # save it so we can compare later return slideshow - # FIXTURE DOWNLOADING UTILS ################################################################################ + def download_fixture_file(source_url, local_path): """ Download fixture file `source_url` to `local_path` if not present already. 
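    The download is streamed to disk in 1 MiB chunks, and a non-200 response fails
    the calling fixture with an assertion error.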
""" if os.path.exists(local_path): return - with open(local_path, 'wb') as f: + with open(local_path, "wb") as f: response = requests.get(source_url, stream=True) - assert response.status_code == 200, "Fixture file with url: {} not found".format(source_url) + assert ( + response.status_code == 200 + ), "Fixture file with url: {} not found".format(source_url) for chunk in response.iter_content(chunk_size=1048576): f.write(chunk) f.flush() f.close() - - diff --git a/tests/media_utils/README.md b/tests/media_utils/README.md index 4c5d861d..df8c8280 100644 --- a/tests/media_utils/README.md +++ b/tests/media_utils/README.md @@ -26,15 +26,15 @@ Various media processing functions and utilities - vendored from the previously ## Converting caption files This contains utilities for converting caption files from a few various formats into the preferred `VTT` format. The currently supported formats include: -- [DFXP](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language) +- [DFXP](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language) - [SAMI](https://en.wikipedia.org/wiki/SAMI) - [SCC](http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML) -- [SRT](https://en.wikipedia.org/wiki/SubRip) +- [SRT](https://en.wikipedia.org/wiki/SubRip) - [TTML](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language) - [WebVTT or just VTT](https://en.wikipedia.org/wiki/WebVTT) -> Within `ricecooker`, the term "captions" and "subtitles" are used interchangeably. All of the -classes and functions handling conversion use the "subtitles" term. +> Within `ricecooker`, the term "captions" and "subtitles" are used interchangeably. All of the +classes and functions handling conversion use the "subtitles" term. ### Language codes @@ -98,7 +98,7 @@ converter = build_subtitle_converter_from_file('/path/to/file') # Replace unknown language code if present if converter.has_language(LANGUAGE_CODE_UNKNOWN): converter.replace_unknown_language('en') - + assert converter.has_language('en'), 'Must have English after replace' output_str = converter.convert('en') @@ -119,6 +119,3 @@ for lang_code in converter.get_language_codes(): elif lang_code == LANGUAGE_CODE_UNKNOWN: raise InvalidSubtitleLanguageError('Unexpected unknown language') ``` - - - diff --git a/tests/media_utils/files/assets/images/copyright.txt b/tests/media_utils/files/assets/images/copyright.txt index fb3f2a64..3ac9a66b 100644 --- a/tests/media_utils/files/assets/images/copyright.txt +++ b/tests/media_utils/files/assets/images/copyright.txt @@ -1,4 +1,3 @@ File: 4933759886_098e9acf93_m.jpg Source: https://flic.kr/p/8vYNVC License: CC BY 2.0 - diff --git a/tests/media_utils/files/page_with_links.html b/tests/media_utils/files/page_with_links.html index 4cd05206..cb728a66 100644 --- a/tests/media_utils/files/page_with_links.html +++ b/tests/media_utils/files/page_with_links.html @@ -26,4 +26,4 @@

jQuery for ubernerds, chapter 1.

- \ No newline at end of file + diff --git a/tests/media_utils/files/subtitles/basic.srt b/tests/media_utils/files/subtitles/basic.srt index 7155e2dc..34593702 100644 --- a/tests/media_utils/files/subtitles/basic.srt +++ b/tests/media_utils/files/subtitles/basic.srt @@ -30,7 +30,7 @@ 8 00:00:35,160 --> 00:00:41,200 -يعرف الحظ على أنه النجاح أو الفشل +يعرف الحظ على أنه النجاح أو الفشل سببه الصدفة على ما يبدو. 9 @@ -80,7 +80,7 @@ 20 00:01:09,947 --> 00:01:14,222 -وأحيانًا تأتي من اتجاهات +وأحيانًا تأتي من اتجاهات لم تكن لتتخيلها. 21 @@ -115,7 +115,7 @@ 28 00:01:38,101 --> 00:01:41,491 -حينما كنا أطفالًا، اعتدنا فعل ذلك +حينما كنا أطفالًا، اعتدنا فعل ذلك طوال الوقت. 29 @@ -133,7 +133,7 @@ 32 00:01:48,858 --> 00:01:51,816 -يجب أن تنتقل من شخص لا يعرف +يجب أن تنتقل من شخص لا يعرف كيف يركب دراجة فى أسبوعه الأول 33 @@ -204,17 +204,17 @@ 49 00:02:30,242 --> 00:02:33,893 -فهناك مخاطر فكرية، +فهناك مخاطر فكرية، ومخاطر مادية، ومخاطر مالية 50 00:02:33,897 --> 00:02:37,905 -ومخاطر عاطفية، ومخاطر اجتماعية، +ومخاطر عاطفية، ومخاطر اجتماعية، ومخاطر أخلاقية، ومخاطر سياسية. 51 00:02:37,929 --> 00:02:41,714 -وبمجرد الانتهاء من ذلك، يقارنون بيانات +وبمجرد الانتهاء من ذلك، يقارنون بيانات المخاطرة الخاصة بهم مع الآخرين. 52 @@ -321,7 +321,7 @@ 76 00:03:54,721 --> 00:03:57,493 -وقال "أتعلمين يا تينا، +وقال "أتعلمين يا تينا، هذا لن يجدي نفعًا لنا، 77 @@ -338,7 +338,7 @@ 80 00:04:03,051 --> 00:04:06,046 -وفي نهاية الرحلة تبادلنا +وفي نهاية الرحلة تبادلنا معلومات الاتصال الخاصة بنا. 81 @@ -404,7 +404,7 @@ 96 00:04:41,456 --> 00:04:44,481 -أعني أنه أراد أن ينشر كتابًا +أعني أنه أراد أن ينشر كتابًا مع طلابي وليس معي. 97 @@ -413,7 +413,7 @@ 98 00:04:45,680 --> 00:04:49,191 -بعد ذلك دعوته للمجيء، +بعد ذلك دعوته للمجيء، فجاء هو وزملاؤه إلى ستانفورد 99 @@ -467,7 +467,7 @@ 111 00:05:20,949 --> 00:05:25,273 -ولكن هذا الحظ كان نتيجة سلسلة +ولكن هذا الحظ كان نتيجة سلسلة من مجازفاتٍ صغيرةٍ قمت بها، 112 @@ -492,7 +492,7 @@ 117 00:05:36,985 --> 00:05:41,284 -يمكنكم فعل ذلك إذا غامرتم قليلًا +يمكنكم فعل ذلك إذا غامرتم قليلًا لتخرجوا من منطقة راحتكم. 118 @@ -546,7 +546,7 @@ 130 00:06:16,492 --> 00:06:20,761 -الآن، أنا أدير ثلاثة برامج زمالة +الآن، أنا أدير ثلاثة برامج زمالة في ستانفورد، 131 @@ -555,7 +555,7 @@ 132 00:06:22,906 --> 00:06:26,147 -وعندما أرسل رسالةً لمن تم استبعادهم +وعندما أرسل رسالةً لمن تم استبعادهم من الطلاب، 133 @@ -564,7 +564,7 @@ 134 00:06:29,186 --> 00:06:32,376 -فبعض من يشعر بخيبة الأمل +فبعض من يشعر بخيبة الأمل يرسلون لي الملاحظات والشكاوى. 135 @@ -582,7 +582,7 @@ 138 00:06:38,834 --> 00:06:42,173 -يرسل لي أحدهم ملاحظة يشكرني فيها +يرسل لي أحدهم ملاحظة يشكرني فيها على الفرصة. 139 @@ -673,7 +673,7 @@ 159 00:07:41,408 --> 00:07:44,917 -لكنها كانت الرياح التي لم نتوقعها +لكنها كانت الرياح التي لم نتوقعها من الأساس. 160 @@ -690,7 +690,7 @@ 163 00:07:54,107 --> 00:07:57,655 -والتكتيك المفضل لدي هو أنه +والتكتيك المفضل لدي هو أنه في نهاية كل يوم، 164 @@ -751,12 +751,12 @@ 177 00:08:37,898 --> 00:08:42,708 -فى الحقيقة، الأفكار المريعة +فى الحقيقة، الأفكار المريعة عادة ما تكون بذورها أشياء رائعة بحق. 
178 00:08:44,039 --> 00:08:47,248 -فواحدة من تماريني المفضلة على الإبداع +فواحدة من تماريني المفضلة على الإبداع في فصولي 179 @@ -785,7 +785,7 @@ 185 00:09:06,611 --> 00:09:10,070 -فكانت أفضل الأفكار هي اقتراحات كبناء مطعم +فكانت أفضل الأفكار هي اقتراحات كبناء مطعم على قمة جبل 186 @@ -798,7 +798,7 @@ 188 00:09:14,573 --> 00:09:19,359 -أما عن الأفكار السيئة فهي أشياء مثل +أما عن الأفكار السيئة فهي أشياء مثل بناء مطعم في مكب للقمامة، 189 @@ -832,7 +832,7 @@ 196 00:09:40,159 --> 00:09:44,722 -وأصبح كل فريق معه فكرة ظن الفريق الآخر +وأصبح كل فريق معه فكرة ظن الفريق الآخر أنها مريعة، 197 @@ -885,7 +885,7 @@ 208 00:10:18,689 --> 00:10:22,408 -حسنًا، لقد تحول هذا إلى مطعم +حسنًا، لقد تحول هذا إلى مطعم بمثابة ساحة تدريب 209 @@ -903,7 +903,7 @@ 212 00:10:31,476 --> 00:10:34,559 -مع جميع أنواع المكونات المثيرة للإهتمام +مع جميع أنواع المكونات المثيرة للإهتمام والغريبة حقًا. 213 @@ -958,12 +958,12 @@ 225 00:11:17,163 --> 00:11:20,671 -إذا كنتم مستعدين للخروج حقًا +إذا كنتم مستعدين للخروج حقًا والتعبير عن التقدير، 226 00:11:20,695 --> 00:11:25,116 -ومستعدين للنظر بعناية للأفكار +ومستعدين للنظر بعناية للأفكار حتى وإن كانت مجنونة، 227 @@ -982,4 +982,3 @@ 230 00:11:32,867 --> 00:11:37,915 (تصفيق) - diff --git a/tests/media_utils/files/subtitles/basic.vtt b/tests/media_utils/files/subtitles/basic.vtt index 08423ace..3c9e115b 100644 --- a/tests/media_utils/files/subtitles/basic.vtt +++ b/tests/media_utils/files/subtitles/basic.vtt @@ -24,7 +24,7 @@ WEBVTT لذا، ما هو تعريف الحظ؟ 00:35.160 --> 00:41.200 -يعرف الحظ على أنه النجاح أو الفشل +يعرف الحظ على أنه النجاح أو الفشل سببه الصدفة على ما يبدو. 00:42.295 --> 00:43.682 @@ -62,7 +62,7 @@ WEBVTT وأحيانًا أخرى تعصف، 01:09.947 --> 01:14.222 -وأحيانًا تأتي من اتجاهات +وأحيانًا تأتي من اتجاهات لم تكن لتتخيلها. 01:14.222 --> 01:16.689 @@ -89,7 +89,7 @@ WEBVTT التي ستخرجكم من منطقة راحتكم. 01:38.101 --> 01:41.491 -حينما كنا أطفالًا، اعتدنا فعل ذلك +حينما كنا أطفالًا، اعتدنا فعل ذلك طوال الوقت. 01:42.459 --> 01:45.708 @@ -103,7 +103,7 @@ WEBVTT أو حتى دراسة ميكانيكا الكم، صحيح؟ 01:48.858 --> 01:51.816 -يجب أن تنتقل من شخص لا يعرف +يجب أن تنتقل من شخص لا يعرف كيف يركب دراجة فى أسبوعه الأول 01:51.820 --> 01:53.603 @@ -157,15 +157,15 @@ WEBVTT أن المخاطرة ليست أمرًا ثنائيًا. 02:30.242 --> 02:33.893 -فهناك مخاطر فكرية، +فهناك مخاطر فكرية، ومخاطر مادية، ومخاطر مالية 02:33.897 --> 02:37.905 -ومخاطر عاطفية، ومخاطر اجتماعية، +ومخاطر عاطفية، ومخاطر اجتماعية، ومخاطر أخلاقية، ومخاطر سياسية. 02:37.929 --> 02:41.714 -وبمجرد الانتهاء من ذلك، يقارنون بيانات +وبمجرد الانتهاء من ذلك، يقارنون بيانات المخاطرة الخاصة بهم مع الآخرين. 02:41.738 --> 02:45.040 @@ -247,7 +247,7 @@ WEBVTT وكان مهذبًا للغاية وقرأه، 03:54.721 --> 03:57.493 -وقال "أتعلمين يا تينا، +وقال "أتعلمين يا تينا، هذا لن يجدي نفعًا لنا، 03:57.507 --> 03:59.174 @@ -260,7 +260,7 @@ WEBVTT أغلقت حاسوبي. 04:03.051 --> 04:06.046 -وفي نهاية الرحلة تبادلنا +وفي نهاية الرحلة تبادلنا معلومات الاتصال الخاصة بنا. 04:07.594 --> 04:10.026 @@ -310,14 +310,14 @@ WEBVTT (ضحك) 04:41.456 --> 04:44.481 -أعني أنه أراد أن ينشر كتابًا +أعني أنه أراد أن ينشر كتابًا مع طلابي وليس معي. 04:44.505 --> 04:45.656 ولكن حسنًا، لا بأس. 
04:45.680 --> 04:49.191 -بعد ذلك دعوته للمجيء، +بعد ذلك دعوته للمجيء، فجاء هو وزملاؤه إلى ستانفورد 04:49.191 --> 04:52.655 @@ -358,7 +358,7 @@ WEBVTT بالطبع كنت محظوظة، 05:20.949 --> 05:25.273 -ولكن هذا الحظ كان نتيجة سلسلة +ولكن هذا الحظ كان نتيجة سلسلة من مجازفاتٍ صغيرةٍ قمت بها، 05:25.297 --> 05:27.567 @@ -377,7 +377,7 @@ WEBVTT حتى وإن كنتم تعتقدون أنكم الأسوأ حظًا، 05:36.985 --> 05:41.284 -يمكنكم فعل ذلك إذا غامرتم قليلًا +يمكنكم فعل ذلك إذا غامرتم قليلًا لتخرجوا من منطقة راحتكم. 05:41.284 --> 05:43.627 @@ -418,21 +418,21 @@ WEBVTT ويجب أن تعترفوا بما يفعلونه. 06:16.492 --> 06:20.761 -الآن، أنا أدير ثلاثة برامج زمالة +الآن، أنا أدير ثلاثة برامج زمالة في ستانفورد، 06:20.785 --> 06:22.882 وهم يتنافسون بشراسة ليتمكنوا من الانضمام 06:22.906 --> 06:26.147 -وعندما أرسل رسالةً لمن تم استبعادهم +وعندما أرسل رسالةً لمن تم استبعادهم من الطلاب، 06:26.171 --> 06:29.182 أعرف دائما أن البعض منهم يخيب أملهم. 06:29.186 --> 06:32.376 -فبعض من يشعر بخيبة الأمل +فبعض من يشعر بخيبة الأمل يرسلون لي الملاحظات والشكاوى. 06:32.400 --> 06:34.003 @@ -446,7 +446,7 @@ WEBVTT ومن وقتٍ لآخر، 06:38.834 --> 06:42.173 -يرسل لي أحدهم ملاحظة يشكرني فيها +يرسل لي أحدهم ملاحظة يشكرني فيها على الفرصة. 06:42.880 --> 06:44.596 @@ -516,7 +516,7 @@ WEBVTT وكل ذلك نتيجة لملاحظة كتب فيها "شكرا لك". 07:41.408 --> 07:44.917 -لكنها كانت الرياح التي لم نتوقعها +لكنها كانت الرياح التي لم نتوقعها من الأساس. 07:46.145 --> 07:48.214 @@ -529,7 +529,7 @@ WEBVTT حتى تساعدني حقًا على ترسيخ فكرة التقدير. 07:54.107 --> 07:57.655 -والتكتيك المفضل لدي هو أنه +والتكتيك المفضل لدي هو أنه في نهاية كل يوم، 07:57.679 --> 08:01.435 @@ -576,11 +576,11 @@ WEBVTT فالأفكار ليست جيدة أو سيئة. 08:37.898 --> 08:42.708 -فى الحقيقة، الأفكار المريعة +فى الحقيقة، الأفكار المريعة عادة ما تكون بذورها أشياء رائعة بحق. 08:44.039 --> 08:47.248 -فواحدة من تماريني المفضلة على الإبداع +فواحدة من تماريني المفضلة على الإبداع في فصولي 08:47.262 --> 08:51.324 @@ -602,7 +602,7 @@ WEBVTT وأسوأ الأفكار لمطعم جديد. 09:06.611 --> 09:10.070 -فكانت أفضل الأفكار هي اقتراحات كبناء مطعم +فكانت أفضل الأفكار هي اقتراحات كبناء مطعم على قمة جبل 09:10.094 --> 09:11.629 @@ -612,7 +612,7 @@ WEBVTT أو مطعم على قارب يطل على منظر خلاب. 09:14.573 --> 09:19.359 -أما عن الأفكار السيئة فهي أشياء مثل +أما عن الأفكار السيئة فهي أشياء مثل بناء مطعم في مكب للقمامة، 09:19.383 --> 09:23.627 @@ -638,7 +638,7 @@ WEBVTT وقمت بإعادة توزيعها. 09:40.159 --> 09:44.722 -وأصبح كل فريق معه فكرة ظن الفريق الآخر +وأصبح كل فريق معه فكرة ظن الفريق الآخر أنها مريعة، 09:44.746 --> 09:47.563 @@ -679,7 +679,7 @@ WEBVTT أما عن المطعم صاحب الخدمة السيئة، 10:18.689 --> 10:22.408 -حسنًا، لقد تحول هذا إلى مطعم +حسنًا، لقد تحول هذا إلى مطعم بمثابة ساحة تدريب 10:22.432 --> 10:26.527 @@ -693,7 +693,7 @@ WEBVTT فقد تحول إلى مطعم للسوشي بالطبع 10:31.476 --> 10:34.559 -مع جميع أنواع المكونات المثيرة للإهتمام +مع جميع أنواع المكونات المثيرة للإهتمام والغريبة حقًا. 10:35.922 --> 10:39.010 @@ -735,11 +735,11 @@ WEBVTT وإذا كنتم مستعدين للمغامرة قليلًا، 11:17.163 --> 11:20.671 -إذا كنتم مستعدين للخروج حقًا +إذا كنتم مستعدين للخروج حقًا والتعبير عن التقدير، 11:20.695 --> 11:25.116 -ومستعدين للنظر بعناية للأفكار +ومستعدين للنظر بعناية للأفكار حتى وإن كانت مجنونة، 11:25.140 --> 11:26.909 diff --git a/tests/media_utils/files/subtitles/empty.ttml b/tests/media_utils/files/subtitles/empty.ttml index eeeaf3bd..2207a72c 100644 --- a/tests/media_utils/files/subtitles/empty.ttml +++ b/tests/media_utils/files/subtitles/empty.ttml @@ -10,4 +10,4 @@
- \ No newline at end of file + diff --git a/tests/media_utils/files/subtitles/encapsulated.sami b/tests/media_utils/files/subtitles/encapsulated.sami index 64c66ed4..8aedc1ef 100644 --- a/tests/media_utils/files/subtitles/encapsulated.sami +++ b/tests/media_utils/files/subtitles/encapsulated.sami @@ -41,4 +41,4 @@ P { margin-left: 1pt;

<LAUGHING & WHOOPS!>

- \ No newline at end of file + diff --git a/tests/media_utils/files/subtitles/not.txt b/tests/media_utils/files/subtitles/not.txt index f81d8613..3ae16f30 100644 --- a/tests/media_utils/files/subtitles/not.txt +++ b/tests/media_utils/files/subtitles/not.txt @@ -1 +1 @@ -This file doesn't contain subtitles nor isn't it a subtitle format. \ No newline at end of file +This file doesn't contain subtitles nor isn't it a subtitle format. diff --git a/tests/media_utils/test_proxy.py b/tests/media_utils/test_proxy.py index 1b28590e..8775ffc8 100644 --- a/tests/media_utils/test_proxy.py +++ b/tests/media_utils/test_proxy.py @@ -6,20 +6,23 @@ from ricecooker.utils.youtube import YouTubeResource -YOUTUBE_TEST_VIDEO = 'https://www.youtube.com/watch?v=C0DPdy98e4c' -YOUTUBE_TEST_PLAYLIST = 'https://www.youtube.com/playlist?list=PL472BC6F4F2C3ABEF' +YOUTUBE_TEST_VIDEO = "https://www.youtube.com/watch?v=C0DPdy98e4c" +YOUTUBE_TEST_PLAYLIST = "https://www.youtube.com/playlist?list=PL472BC6F4F2C3ABEF" # This test takes a few minutes, but is very useful for checking that the proxy is not being ignored, # so mark it to run when the PYTEST_RUN_SLOW env var is set. -@pytest.mark.skipif(not 'PYTEST_RUN_SLOW' in os.environ, reason="This test takes several minutes to complete.") +@pytest.mark.skipif( + not "PYTEST_RUN_SLOW" in os.environ, + reason="This test takes several minutes to complete.", +) def test_bad_proxies_get_banned(tmp_path): # create some fake proxies... FAKE_PROXIES = [ - '123.123.123.123:1234', - '142.123.1.234:123345', - '156.245.233.211:12323', - '11.22.33.44:123', + "123.123.123.123:1234", + "142.123.1.234:123345", + "156.245.233.211:12323", + "11.22.33.44:123", ] # initialize PROXY_LIST to known-bad proxies to check that they get banned proxy.PROXY_LIST = FAKE_PROXIES.copy() @@ -31,7 +34,10 @@ def test_bad_proxies_get_banned(tmp_path): assert set(FAKE_PROXIES).issubset(set(proxy.BROKEN_PROXIES)) -@pytest.mark.skipif(not 'PYTEST_RUN_SLOW' in os.environ, reason="This test can take several minutes to complete.") +@pytest.mark.skipif( + not "PYTEST_RUN_SLOW" in os.environ, + reason="This test can take several minutes to complete.", +) def test_proxy_download(tmp_path): proxy.get_proxies(refresh=True) assert len(proxy.PROXY_LIST) > 1 @@ -39,21 +45,29 @@ def test_proxy_download(tmp_path): video = YouTubeResource(YOUTUBE_TEST_VIDEO) video.download(tmp_path) - temp_files = os.listdir(os.path.join(tmp_path, 'Watch')) + temp_files = os.listdir(os.path.join(tmp_path, "Watch")) has_video = False for afile in temp_files: - if afile.endswith('.mp4'): + if afile.endswith(".mp4"): has_video = True - assert has_video, 'Video file not found' + assert has_video, "Video file not found" -@pytest.mark.skipif(not 'PYTEST_RUN_SLOW' in os.environ, reason="This test can take several minutes to complete.") +@pytest.mark.skipif( + not "PYTEST_RUN_SLOW" in os.environ, + reason="This test can take several minutes to complete.", +) def test_proxy_playlist_download(tmp_path): playlist = YouTubeResource(YOUTUBE_TEST_PLAYLIST) playlist.download(tmp_path) - temp_files = os.listdir(os.path.join(tmp_path, 'Playlist')) - expected = ['zbkizy-Y3qw.jpg', 'oXnzstpBEOg.mp4', 'oXnzstpBEOg.jpg', 'zbkizy-Y3qw.mp4'] + temp_files = os.listdir(os.path.join(tmp_path, "Playlist")) + expected = [ + "zbkizy-Y3qw.jpg", + "oXnzstpBEOg.mp4", + "oXnzstpBEOg.jpg", + "zbkizy-Y3qw.mp4", + ] - assert set(temp_files) == set(expected) + assert set(temp_files) == set(expected) diff --git a/tests/media_utils/test_subtitles.py 
b/tests/media_utils/test_subtitles.py index 60a71bef..144bb5a5 100644 --- a/tests/media_utils/test_subtitles.py +++ b/tests/media_utils/test_subtitles.py @@ -1,35 +1,44 @@ import codecs -import os import hashlib +import os import tempfile from unittest import TestCase + +from le_utils.constants import file_formats +from le_utils.constants import languages + from ricecooker.utils.subtitles import build_subtitle_converter_from_file -from ricecooker.utils.subtitles import LANGUAGE_CODE_UNKNOWN from ricecooker.utils.subtitles import InvalidSubtitleFormatError from ricecooker.utils.subtitles import InvalidSubtitleLanguageError -from le_utils.constants import languages, file_formats +from ricecooker.utils.subtitles import LANGUAGE_CODE_UNKNOWN -test_files_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'files', 'subtitles') +test_files_dir = os.path.join( + os.path.abspath(os.path.dirname(__file__)), "files", "subtitles" +) class SubtitleConverterTest(TestCase): def get_file_hash(self, path): hash = hashlib.md5() - with open(path, 'rb') as fobj: + with open(path, "rb") as fobj: for chunk in iter(lambda: fobj.read(2097152), b""): hash.update(chunk) return hash.hexdigest() def assertFilesEqual(self, expected_file, actual_file): - with codecs.open(actual_file, 'rb', encoding='utf-8') as act, codecs.open(expected_file, 'rb', encoding='utf-8') as exp: + with codecs.open(actual_file, "rb", encoding="utf-8") as act, codecs.open( + expected_file, "rb", encoding="utf-8" + ) as exp: for actual_str, expected_str in zip(act.readlines(), exp.readlines()): self.assertEqual(actual_str.strip(), expected_str.strip()) def test_replace_unknown_language(self): - expected_language = languages.getlang_by_name('Arabic') + expected_language = languages.getlang_by_name("Arabic") - converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt')) + converter = build_subtitle_converter_from_file( + os.path.join(test_files_dir, "basic.srt") + ) self.assertTrue(converter.has_language(LANGUAGE_CODE_UNKNOWN)) converter.replace_unknown_language(expected_language.code) @@ -38,10 +47,12 @@ def test_replace_unknown_language(self): self.assertFalse(converter.has_language(LANGUAGE_CODE_UNKNOWN)) def test_srt_conversion(self): - expected_file = os.path.join(test_files_dir, 'basic.vtt') - expected_language = languages.getlang_by_name('Arabic') + expected_file = os.path.join(test_files_dir, "basic.vtt") + expected_language = languages.getlang_by_name("Arabic") - converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'basic.srt')) + converter = build_subtitle_converter_from_file( + os.path.join(test_files_dir, "basic.srt") + ) converter.replace_unknown_language(expected_language.code) actual_file_d, actual_file_name = tempfile.mkstemp() @@ -54,11 +65,12 @@ def test_srt_conversion(self): def test_expected_srt_conversion(self): expected_format = file_formats.SRT - expected_file = os.path.join(test_files_dir, 'basic.vtt') - expected_language = languages.getlang_by_name('Arabic') + expected_file = os.path.join(test_files_dir, "basic.vtt") + expected_language = languages.getlang_by_name("Arabic") converter = build_subtitle_converter_from_file( - os.path.join(test_files_dir, 'basic.srt'), in_format=expected_format) + os.path.join(test_files_dir, "basic.srt"), in_format=expected_format + ) converter.replace_unknown_language(expected_language.code) actual_file_d, actual_file_name = tempfile.mkstemp() @@ -69,39 +81,44 @@ def test_expected_srt_conversion(self): os.close(actual_file_d) 
os.remove(actual_file_name) - def test_not_expected_type(self): expected_format = file_formats.SCC - expected_language = languages.getlang_by_name('Arabic') + expected_language = languages.getlang_by_name("Arabic") converter = build_subtitle_converter_from_file( - os.path.join(test_files_dir, 'basic.srt'), in_format=expected_format) + os.path.join(test_files_dir, "basic.srt"), in_format=expected_format + ) with self.assertRaises(InvalidSubtitleFormatError): converter.convert(expected_language.code) def test_invalid_format(self): - expected_language = languages.getlang_by_name('English') + expected_language = languages.getlang_by_name("English") - converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'not.txt')) + converter = build_subtitle_converter_from_file( + os.path.join(test_files_dir, "not.txt") + ) with self.assertRaises(InvalidSubtitleFormatError): converter.convert(expected_language.code) def test_invalid_format__empty(self): - expected_language = languages.getlang_by_name('English') + expected_language = languages.getlang_by_name("English") - converter = build_subtitle_converter_from_file(os.path.join(test_files_dir, 'empty.ttml')) + converter = build_subtitle_converter_from_file( + os.path.join(test_files_dir, "empty.ttml") + ) - with self.assertRaises(InvalidSubtitleFormatError, msg='Caption file is empty'): + with self.assertRaises(InvalidSubtitleFormatError, msg="Caption file is empty"): converter.convert(expected_language.code) def test_valid_language(self): - expected_file = os.path.join(test_files_dir, 'encapsulated.vtt') - expected_language = languages.getlang_by_name('English') + expected_file = os.path.join(test_files_dir, "encapsulated.vtt") + expected_language = languages.getlang_by_name("English") converter = build_subtitle_converter_from_file( - os.path.join(test_files_dir, 'encapsulated.sami')) + os.path.join(test_files_dir, "encapsulated.sami") + ) self.assertTrue(converter.has_language(expected_language.code)) actual_file_d, actual_file_name = tempfile.mkstemp() @@ -113,10 +130,11 @@ def test_valid_language(self): os.remove(actual_file_name) def test_invalid_language(self): - expected_language = languages.getlang_by_name('Spanish') + expected_language = languages.getlang_by_name("Spanish") converter = build_subtitle_converter_from_file( - os.path.join(test_files_dir, 'encapsulated.sami')) + os.path.join(test_files_dir, "encapsulated.sami") + ) with self.assertRaises(InvalidSubtitleLanguageError): converter.convert(expected_language.code) diff --git a/tests/media_utils/test_thumbnails.py b/tests/media_utils/test_thumbnails.py index aace30c0..c075c780 100644 --- a/tests/media_utils/test_thumbnails.py +++ b/tests/media_utils/test_thumbnails.py @@ -1,33 +1,34 @@ import os + import PIL import pytest +from .test_videos import bad_video +from .test_videos import high_res_video +from .test_videos import low_res_video +from .test_videos import TempFile from ricecooker.utils import images from ricecooker.utils import videos -from .test_videos import low_res_video, high_res_video, bad_video, TempFile - tests_dir = os.path.dirname(os.path.abspath(__file__)) -files_dir = os.path.join(tests_dir, 'files') -outputs_dir = os.path.join(files_dir, 'expected_output') +files_dir = os.path.join(tests_dir, "files") +outputs_dir = os.path.join(files_dir, "expected_output") # these settings are chosen to match our current use case in Studio -studio_cmap_options = {'name': 'BuPu', 'vmin': 0.3, 'vmax': 0.7, 'color': 'black'} - - -SHOW_THUMBS = False # set to True to 
show outputs when running tests locally +studio_cmap_options = {"name": "BuPu", "vmin": 0.3, "vmax": 0.7, "color": "black"} +SHOW_THUMBS = False # set to True to show outputs when running tests locally # TESTS ################################################################################ -class BaseThumbnailGeneratorTestCase(object): +class BaseThumbnailGeneratorTestCase(object): def check_is_png_file(self, output_file): - PNG_MAGIC_NUMBER = b'\x89P' + PNG_MAGIC_NUMBER = b"\x89P" with open(output_file, "rb") as f: f.seek(0) assert f.read(2) == PNG_MAGIC_NUMBER @@ -40,9 +41,10 @@ def check_thumbnail_generated(self, output_file): assert os.path.exists(output_file) im = PIL.Image.open(output_file) width, height = im.size - if SHOW_THUMBS: im.show() - assert width < 1000, 'thumbnail generated is too large (w >= 1000)' - assert height < 1000, 'thumbnail generated is too tall (h >= 1000)' + if SHOW_THUMBS: + im.show() + assert width < 1000, "thumbnail generated is too large (w >= 1000)" + assert height < 1000, "thumbnail generated is too tall (h >= 1000)" return im def check_16_9_format(self, output_file): @@ -53,12 +55,13 @@ def check_16_9_format(self, output_file): assert os.path.exists(output_file) im = PIL.Image.open(output_file) width, height = im.size - assert float(width)/float(height) == 16.0/9.0 - if SHOW_THUMBS: im.show() + assert float(width) / float(height) == 16.0 / 9.0 + if SHOW_THUMBS: + im.show() return im -class Test_pdf_thumbnail_generation(BaseThumbnailGeneratorTestCase): +class Test_pdf_thumbnail_generation(BaseThumbnailGeneratorTestCase): def test_generates_thumbnail(self, tmpdir): input_file = os.path.join(files_dir, "generate_thumbnail", "sample.pdf") assert os.path.exists(input_file) @@ -70,205 +73,201 @@ def test_generates_16_9_thumbnail(self, tmpdir): input_file = os.path.join(files_dir, "generate_thumbnail", "sample.pdf") assert os.path.exists(input_file) output_file = tmpdir.join("pdf_16_9.png").strpath - images.create_image_from_pdf_page(input_file, output_file, crop='smart') + images.create_image_from_pdf_page(input_file, output_file, crop="smart") self.check_16_9_format(output_file) def test_raises_for_missing_file(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.pdf') + input_file = os.path.join(files_dir, "file_that_does_not_exist.pdf") assert not os.path.exists(input_file) - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_pdf_page(input_file, output_file) def test_raises_for_invalid_pdf(self, tmpdir, bad_pdf_file): input_file = bad_pdf_file.name - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_pdf_page(input_file, output_file) class Test_html_zip_thumbnail_generation(BaseThumbnailGeneratorTestCase): - def test_generates_16_9_thumbnail(self, tmpdir): """ The test fixtue `sample.zip` contains three images, one tall, one wide, and one roughly square. The "choose largest area" logic shoudl select the blue one to use as the thumbnail. 
""" - input_file = os.path.join(files_dir, 'generate_thumbnail', 'sample.zip') + input_file = os.path.join(files_dir, "generate_thumbnail", "sample.zip") assert os.path.exists(input_file) - output_file = tmpdir.join('zipfile.png').strpath + output_file = tmpdir.join("zipfile.png").strpath images.create_image_from_zip(input_file, output_file) im = self.check_16_9_format(output_file) # check is blue image r, g, b = im.getpixel((1, 1)) - assert b>g and b>r, (r,g,b) + assert b > g and b > r, (r, g, b) def test_raises_for_missing_file(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.zip') + input_file = os.path.join(files_dir, "file_that_does_not_exist.zip") assert not os.path.exists(input_file) - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_zip(input_file, output_file) def test_raises_for_invalid_zip(self, tmpdir, bad_zip_file): input_file = bad_zip_file.name - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_zip(input_file, output_file) - class Test_tiled_thumbnail_generation(BaseThumbnailGeneratorTestCase): - def test_generates_brazil_thumbnail(self, tmpdir): - input_file = os.path.join(files_dir, 'thumbnails', 'BRAlogo1.png') + input_file = os.path.join(files_dir, "thumbnails", "BRAlogo1.png") assert os.path.exists(input_file) input_files = [input_file, input_file, input_file, input_file] - output_file = tmpdir.join('tiled.png').strpath + output_file = tmpdir.join("tiled.png").strpath images.create_tiled_image(input_files, output_file) self.check_16_9_format(output_file) def test_generates_kolibris_thumbnail(self, tmpdir): - filenames = ['BRAlogo1.png', 'toosquare.png', 'tootall.png', 'toowide.png'] + filenames = ["BRAlogo1.png", "toosquare.png", "tootall.png", "toowide.png"] input_files = [] for filename in filenames: - input_file = os.path.join(files_dir, 'thumbnails', filename) + input_file = os.path.join(files_dir, "thumbnails", filename) assert os.path.exists(input_file) input_files.append(input_file) - output_file = tmpdir.join('tiled.png').strpath + output_file = tmpdir.join("tiled.png").strpath images.create_tiled_image(input_files, output_file) self.check_16_9_format(output_file) def test_raises_for_missing_file(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.png') + input_file = os.path.join(files_dir, "file_that_does_not_exist.png") assert not os.path.exists(input_file) input_files = [input_file, input_file, input_file, input_file] - output_file = tmpdir.join('tiled.png').strpath + output_file = tmpdir.join("tiled.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_tiled_image(input_files, output_file) def test_raises_for_wrong_number_of_files(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.png') + input_file = os.path.join(files_dir, "file_that_does_not_exist.png") assert not os.path.exists(input_file) input_files = [input_file, input_file, input_file, input_file] - output_file = tmpdir.join('tiled.png').strpath + output_file = tmpdir.join("tiled.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_tiled_image(input_files, output_file) def test_raises_for_invalid_png(self, tmpdir, bad_png_file): input_file = bad_png_file.name input_files = [input_file, input_file, 
input_file, input_file] - output_file = tmpdir.join('tiled.png').strpath + output_file = tmpdir.join("tiled.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_tiled_image(input_files, output_file) class Test_epub_thumbnail_generation(BaseThumbnailGeneratorTestCase): - def test_generates_thumbnail(self, tmpdir): - input_file = os.path.join(files_dir, 'generate_thumbnail', 'sample.epub') + input_file = os.path.join(files_dir, "generate_thumbnail", "sample.epub") assert os.path.exists(input_file) - output_file = tmpdir.join('epub.png').strpath + output_file = tmpdir.join("epub.png").strpath images.create_image_from_epub(input_file, output_file) self.check_thumbnail_generated(output_file) def test_generates_16_9_thumbnail(self, tmpdir): - input_file = os.path.join(files_dir, 'generate_thumbnail', 'sample.epub') + input_file = os.path.join(files_dir, "generate_thumbnail", "sample.epub") assert os.path.exists(input_file) - output_file = tmpdir.join('epub_16_9.png').strpath - images.create_image_from_epub(input_file, output_file, crop='smart') + output_file = tmpdir.join("epub_16_9.png").strpath + images.create_image_from_epub(input_file, output_file, crop="smart") self.check_16_9_format(output_file) def test_generates_16_9_thumbnail_from_top(self, tmpdir): - input_file = os.path.join(files_dir, 'generate_thumbnail', 'sample.epub') + input_file = os.path.join(files_dir, "generate_thumbnail", "sample.epub") assert os.path.exists(input_file) - output_file = tmpdir.join('epub_16_9_top.png').strpath + output_file = tmpdir.join("epub_16_9_top.png").strpath images.create_image_from_epub(input_file, output_file, crop=",0") self.check_16_9_format(output_file) def test_raises_for_missing_file(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.epub') + input_file = os.path.join(files_dir, "file_that_does_not_exist.epub") assert not os.path.exists(input_file) - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_epub(input_file, output_file) def test_raises_for_invalid_epub(self, tmpdir, bad_epub_file): input_file = bad_epub_file.name - output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): images.create_image_from_epub(input_file, output_file) - class Test_video_thumbnail_generation(BaseThumbnailGeneratorTestCase): - def test_generates_16_9_thumbnail_from_low_res(self, tmpdir, low_res_video): input_file = low_res_video.name - output_file = tmpdir.join('low_res_video_thumbnail.png').strpath + output_file = tmpdir.join("low_res_video_thumbnail.png").strpath videos.extract_thumbnail_from_video(input_file, output_file, overwrite=True) self.check_16_9_format(output_file) self.check_is_png_file(output_file) def test_generates_16_9_thumbnail_from_high_res(self, tmpdir, high_res_video): input_file = high_res_video.name - output_file = tmpdir.join('high_res_video_thumbnail.png').strpath + output_file = tmpdir.join("high_res_video_thumbnail.png").strpath videos.extract_thumbnail_from_video(input_file, output_file, overwrite=True) self.check_16_9_format(output_file) self.check_is_png_file(output_file) def test_raises_for_missing_file(self, tmpdir): - input_file = os.path.join(files_dir, 'file_that_does_not_exist.mp4') + input_file = os.path.join(files_dir, "file_that_does_not_exist.mp4") assert not os.path.exists(input_file) - 
output_file = tmpdir.join('thumbnail.png').strpath + output_file = tmpdir.join("thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): videos.extract_thumbnail_from_video(input_file, output_file, overwrite=True) def test_bad_video_raises(self, tmpdir, bad_video): input_file = bad_video.name - output_file = tmpdir.join('bad_video_thumbnail.png').strpath + output_file = tmpdir.join("bad_video_thumbnail.png").strpath with pytest.raises(images.ThumbnailGenerationError): videos.extract_thumbnail_from_video(input_file, output_file, overwrite=True) - - # FIXTURES ################################################################################ + @pytest.fixture def bad_audio_file(): - with TempFile(suffix='.mp3') as f: - f.write(b'no mp3 here; ffmpeg should error out.') + with TempFile(suffix=".mp3") as f: + f.write(b"no mp3 here; ffmpeg should error out.") f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor + @pytest.fixture def bad_pdf_file(): - with TempFile(suffix='.pdf') as f: - f.write(b'no pdf here; thumbnail extraction should error out.') + with TempFile(suffix=".pdf") as f: + f.write(b"no pdf here; thumbnail extraction should error out.") f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor + @pytest.fixture def bad_zip_file(): - with TempFile(suffix='.zip') as f: - f.write(b'no zip here; thumbnail extraction should error out.') + with TempFile(suffix=".zip") as f: + f.write(b"no zip here; thumbnail extraction should error out.") f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor + @pytest.fixture def bad_epub_file(): - with TempFile(suffix='.epub') as f: - f.write(b'no epub here; thumbnail extraction should error out.') + with TempFile(suffix=".epub") as f: + f.write(b"no epub here; thumbnail extraction should error out.") f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor + @pytest.fixture def bad_png_file(): - with TempFile(suffix='.png') as f: - f.write(b'no image here; tiled thumbnail processing should error out.') + with TempFile(suffix=".png") as f: + f.write(b"no image here; tiled thumbnail processing should error out.") f.flush() - return f # returns a temporary file with a closed file descriptor - + return f # returns a temporary file with a closed file descriptor diff --git a/tests/media_utils/test_videos.py b/tests/media_utils/test_videos.py index 2c74dde7..43e9fec8 100644 --- a/tests/media_utils/test_videos.py +++ b/tests/media_utils/test_videos.py @@ -1,16 +1,18 @@ from __future__ import print_function + import atexit import os -import PIL -import pytest import re -import requests -import requests_cache import subprocess import sys import tempfile +import PIL +import pytest +import requests +import requests_cache from le_utils.constants import format_presets + from ricecooker.utils import videos @@ -24,9 +26,10 @@ # FIXTURES ################################################################################ + @pytest.fixture def low_res_video(): - with TempFile(suffix='.mp4') as f: + with TempFile(suffix=".mp4") as f: resp = requests.get( "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4", stream=True, @@ -34,100 +37,102 @@ def low_res_video(): for chunk in 
resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def high_res_video(): - with TempFile(suffix='.mp4') as f: + with TempFile(suffix=".mp4") as f: resp = requests.get( "https://ia800201.us.archive.org/7/items/" "UnderConstructionFREEVideoBackgroundLoopHD1080p/" "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.mp4", - stream=True + stream=True, ) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def low_res_video_webm(): - with TempFile(suffix='.webm') as f: + with TempFile(suffix=".webm") as f: resp = requests.get( "https://mirrors.creativecommons.org/movingimages/" "webm/CreativeCommonsPlusCommercial_240p.webm", - stream=True + stream=True, ) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def high_res_video_webm(): - with TempFile(suffix='.webm') as f: + with TempFile(suffix=".webm") as f: resp = requests.get( "https://mirrors.creativecommons.org/movingimages/" "webm/CreativeCommonsPlusCommercial_720p.webm", - stream=True + stream=True, ) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def low_res_ogv_video(): - with TempFile(suffix='.ogv') as f: + with TempFile(suffix=".ogv") as f: resp = requests.get( "https://archive.org/download/" "UnderConstructionFREEVideoBackgroundLoopHD1080p/" "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.ogv", - stream=True + stream=True, ) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def high_res_mov_video(): - with TempFile(suffix='.mov') as f: + with TempFile(suffix=".mov") as f: resp = requests.get( "https://ia800201.us.archive.org/7/items/" "UnderConstructionFREEVideoBackgroundLoopHD1080p/" "cold%20night.mov", - stream=True + stream=True, ) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor @pytest.fixture def bad_video(): - with TempFile(suffix='.mp4') as f: - f.write(b'novideohere. ffmpeg soshould error') + with TempFile(suffix=".mp4") as f: + f.write(b"novideohere. 
ffmpeg soshould error") f.flush() - return f # returns a temporary file with a closed file descriptor + return f # returns a temporary file with a closed file descriptor # TESTS ################################################################################ -class Test_check_video_resolution: +class Test_check_video_resolution: def test_returns_a_format_preset(self, low_res_video): preset = videos.guess_video_preset_by_resolution(low_res_video.name) - assert preset in [format_presets.VIDEO_HIGH_RES, - format_presets.VIDEO_LOW_RES, - format_presets.VIDEO_VECTOR] + assert preset in [ + format_presets.VIDEO_HIGH_RES, + format_presets.VIDEO_LOW_RES, + format_presets.VIDEO_VECTOR, + ] def test_detects_low_res_videos(self, low_res_video): preset = videos.guess_video_preset_by_resolution(low_res_video.name) @@ -138,89 +143,98 @@ def test_detects_high_res_videos(self, high_res_video): assert preset == format_presets.VIDEO_HIGH_RES def test_detects_low_res_videos_webm(self, low_res_video_webm): - preset = videos.guess_video_preset_by_resolution( - low_res_video_webm.name) + preset = videos.guess_video_preset_by_resolution(low_res_video_webm.name) assert preset == format_presets.VIDEO_LOW_RES def test_detects_high_res_videos_webm(self, high_res_video_webm): - preset = videos.guess_video_preset_by_resolution( - high_res_video_webm.name) + preset = videos.guess_video_preset_by_resolution(high_res_video_webm.name) assert preset == format_presets.VIDEO_HIGH_RES def get_resolution(videopath): """Helper function to get resolution of video at videopath.""" - result = subprocess.check_output(['ffprobe', '-v', 'error', - '-print_format', 'json', '-show_entries', - 'stream=width,height', '-of', - 'default=noprint_wrappers=1', - str(videopath)]) - pattern = re.compile('width=([0-9]*)[^height]+height=([0-9]*)') + result = subprocess.check_output( + [ + "ffprobe", + "-v", + "error", + "-print_format", + "json", + "-show_entries", + "stream=width,height", + "-of", + "default=noprint_wrappers=1", + str(videopath), + ] + ) + pattern = re.compile("width=([0-9]*)[^height]+height=([0-9]*)") m = pattern.search(str(result)) width, height = int(m.group(1)), int(m.group(2)) return width, height class Test_compress_video: - def test_compression_works(self, high_res_video): with TempFile(suffix=".mp4") as vout: - videos.compress_video(high_res_video.name, - vout.name, overwrite=True) + videos.compress_video(high_res_video.name, vout.name, overwrite=True) width, height = get_resolution(vout.name) - assert height == 480, 'should compress to 480 v resolution by default' + assert height == 480, "should compress to 480 v resolution by default" def test_compression_max_width(self, high_res_video): with TempFile(suffix=".mp4") as vout: - videos.compress_video(high_res_video.name, - vout.name, overwrite=True, max_width=120) + videos.compress_video( + high_res_video.name, vout.name, overwrite=True, max_width=120 + ) width, height = get_resolution(vout.name) - assert width == 120, 'should be 120 h resolution since max_width set' + assert width == 120, "should be 120 h resolution since max_width set" def test_compression_max_width_odd(self, high_res_video): """ regression test for: https://github.com/learningequality/pressurecooker/issues/11 """ with TempFile(suffix=".mp4") as vout: - videos.compress_video(high_res_video.name, - vout.name, overwrite=True, max_width=121) + videos.compress_video( + high_res_video.name, vout.name, overwrite=True, max_width=121 + ) width, height = get_resolution(vout.name) - assert width == 120, 'should 
round down to 120 h resolution when max_width=121 set' + assert ( + width == 120 + ), "should round down to 120 h resolution when max_width=121 set" def test_compression_max_height(self, high_res_video): with TempFile(suffix=".mp4") as vout: - videos.compress_video(high_res_video.name, - vout.name, overwrite=True, max_height=140) + videos.compress_video( + high_res_video.name, vout.name, overwrite=True, max_height=140 + ) width, height = get_resolution(vout.name) - assert height == 140, 'should be 140 v resolution since max_height set' + assert height == 140, "should be 140 v resolution since max_height set" def test_raises_for_bad_file(self, bad_video): with TempFile(suffix=".mp4") as vout: with pytest.raises(videos.VideoCompressionError): - videos.compress_video( - bad_video.name, vout.name, overwrite=True) + videos.compress_video(bad_video.name, vout.name, overwrite=True) class Test_convert_video: - def test_convert_mov_works(self, high_res_mov_video): with TempFile(suffix=".mp4") as vout: - videos.compress_video(high_res_mov_video.name, - vout.name, overwrite=True) + videos.compress_video(high_res_mov_video.name, vout.name, overwrite=True) width, height = get_resolution(vout.name) - assert height == 480, 'should convert .ogv to .mp4 and set 480 v res' + assert height == 480, "should convert .ogv to .mp4 and set 480 v res" def test_convert_and_resize_ogv_works(self, low_res_ogv_video): with TempFile(suffix=".mp4") as vout: - videos.compress_video(low_res_ogv_video.name, - vout.name, overwrite=True, max_height=200) + videos.compress_video( + low_res_ogv_video.name, vout.name, overwrite=True, max_height=200 + ) width, height = get_resolution(vout.name) - assert height == 200, 'should convert .ogv to .mp4 and set 200 v res' + assert height == 200, "should convert .ogv to .mp4 and set 200 v res" # Helper class for cross-platform temporary files ################################################################################ + def remove_temp_file(*args, **kwargs): filename = args[0] try: @@ -245,8 +259,7 @@ def __init__(self, *args, **kwargs): def __enter__(self): # create a temporary file as per usual, but set it up to be deleted once we're done - self.f = tempfile.NamedTemporaryFile( - *self.args, delete=False, **self.kwargs) + self.f = tempfile.NamedTemporaryFile(*self.args, delete=False, **self.kwargs) atexit.register(remove_temp_file, self.f.name) return self.f diff --git a/tests/media_utils/test_web.py b/tests/media_utils/test_web.py index 1f03438d..84add7c7 100644 --- a/tests/media_utils/test_web.py +++ b/tests/media_utils/test_web.py @@ -12,13 +12,13 @@ def test_get_links(): links = parser.get_links() expected_links = [ - 'assets/css/empty.css', - 'assets/css/empty2.css', - 'assets/js/empty.js', - 'assets/images/4933759886_098e9acf93_m.jpg', - 'the_spanish_inquisition.html', - 'http://www.learningequality.org', - 'Wilhelm_Scream.mp3' + "assets/css/empty.css", + "assets/css/empty2.css", + "assets/js/empty.js", + "assets/images/4933759886_098e9acf93_m.jpg", + "the_spanish_inquisition.html", + "http://www.learningequality.org", + "Wilhelm_Scream.mp3", ] # make sure the link order is the same to do an equality test @@ -34,12 +34,12 @@ def test_get_local_files(): links = parser.get_local_files() expected_links = [ - 'assets/css/empty.css', - 'assets/css/empty2.css', - 'assets/js/empty.js', - 'assets/images/4933759886_098e9acf93_m.jpg', - 'the_spanish_inquisition.html', - 'Wilhelm_Scream.mp3' + "assets/css/empty.css", + "assets/css/empty2.css", + "assets/js/empty.js", + 
"assets/images/4933759886_098e9acf93_m.jpg", + "the_spanish_inquisition.html", + "Wilhelm_Scream.mp3", ] # make sure the link order is the same to do an equality test @@ -54,17 +54,17 @@ def test_replace_links(): parser = web.HTMLParser(filename) original_links = [ - 'assets/css/empty.css', - 'assets/css/empty2.css', - 'assets/js/empty.js', - 'assets/images/4933759886_098e9acf93_m.jpg', - 'the_spanish_inquisition.html', - 'Wilhelm_Scream.mp3' + "assets/css/empty.css", + "assets/css/empty2.css", + "assets/js/empty.js", + "assets/images/4933759886_098e9acf93_m.jpg", + "the_spanish_inquisition.html", + "Wilhelm_Scream.mp3", ] replacement_links = {} for link in original_links: - replacement_links[link] = '/zipcontent/012343545454645454/{}'.format(link) + replacement_links[link] = "/zipcontent/012343545454645454/{}".format(link) new_html = parser.replace_links(replacement_links) diff --git a/tests/media_utils/test_youtube.py b/tests/media_utils/test_youtube.py index 0f3fe0fe..1c3da4d2 100644 --- a/tests/media_utils/test_youtube.py +++ b/tests/media_utils/test_youtube.py @@ -3,29 +3,31 @@ import tempfile import pytest +from le_utils.constants import file_formats from ricecooker.utils import utils from ricecooker.utils import youtube -from le_utils.constants import file_formats trees = {} yt_resources = {} USE_PROXY_FOR_TESTS = False -cc_playlist = 'https://www.youtube.com/playlist?list=PL7m903CwFUgntbjkVMwts89fZq0INCtVS' -non_cc_playlist = 'https://www.youtube.com/playlist?list=PLBO8M-O_dTPE51ymDUgilf8DclGAEg9_A' -subtitles_video = 'https://www.youtube.com/watch?v=6uXAbJQoZlE' -subtitles_zu_video = 'https://www.youtube.com/watch?v=FN12ty5ztAs' +cc_playlist = "https://www.youtube.com/playlist?list=PL7m903CwFUgntbjkVMwts89fZq0INCtVS" +non_cc_playlist = ( + "https://www.youtube.com/playlist?list=PLBO8M-O_dTPE51ymDUgilf8DclGAEg9_A" +) +subtitles_video = "https://www.youtube.com/watch?v=6uXAbJQoZlE" +subtitles_zu_video = "https://www.youtube.com/watch?v=FN12ty5ztAs" def get_yt_resource(url, **kwargs): global yt_resources if not url in yt_resources: - if 'useproxy' not in kwargs: + if "useproxy" not in kwargs: if USE_PROXY_FOR_TESTS: - kwargs['useproxy'] = True + kwargs["useproxy"] = True else: - kwargs['useproxy'] = False + kwargs["useproxy"] = False yt_resources[url] = youtube.YouTubeResource(url, **kwargs) return yt_resources[url] @@ -34,15 +36,15 @@ def get_yt_resource(url, **kwargs): def test_get_youtube_info(): yt_resource = get_yt_resource(non_cc_playlist) tree = yt_resource.get_resource_info() - assert tree['id'] - assert tree['kind'] - assert tree['title'] - assert len(tree['children']) == 4 + assert tree["id"] + assert tree["kind"] + assert tree["title"] + assert len(tree["children"]) == 4 - for video in tree['children']: - assert video['id'] - assert video['kind'] - assert video['title'] + for video in tree["children"]: + assert video["id"] + assert video["kind"] + assert video["title"] def test_warnings_no_license(): @@ -51,7 +53,7 @@ def test_warnings_no_license(): assert len(issues) == 4 for issue in issues: - assert 'no_license_specified' in issue['warnings'] + assert "no_license_specified" in issue["warnings"] def test_cc_no_warnings(): @@ -61,7 +63,7 @@ def test_cc_no_warnings(): # there is one video in this playlist that is not cc-licensed assert len(issues) == 1 for issue in issues: - assert 'no_license_specified' in issue['warnings'] + assert "no_license_specified" in issue["warnings"] @pytest.mark.skipif(True, reason="Skipping download tests.") @@ -73,8 +75,10 @@ def 
test_download_youtube_video(): info = yt_resource.download(base_path=download_dir) assert info if info: - assert 'filename' in info - assert os.path.exists(info['filename']), 'Filename {} does not exist'.format(info['filename']) + assert "filename" in info + assert os.path.exists( + info["filename"] + ), "Filename {} does not exist".format(info["filename"]) finally: shutil.rmtree(download_dir) @@ -89,11 +93,13 @@ def test_download_youtube_playlist(): info = yt_resource.download(base_path=download_dir) assert info is not None if info: - assert not 'filename' in info - assert 'children' in info - for child in info['children']: - assert 'filename' in child - assert os.path.exists(child['filename']), 'Filename {} does not exist'.format(child['filename']) + assert not "filename" in info + assert "children" in info + for child in info["children"]: + assert "filename" in child + assert os.path.exists( + child["filename"] + ), "Filename {} does not exist".format(child["filename"]) finally: shutil.rmtree(download_dir) @@ -102,14 +108,15 @@ def test_download_youtube_playlist(): def test_get_subtitles(): yt_resource = get_yt_resource(subtitles_video) info = yt_resource.get_resource_subtitles() - assert len(info['subtitles']) == 4 # brittle; can change if subs get added - assert 'ru' in info['subtitles'] - assert 'en' in info['subtitles'] - assert 'zh-CN' in info['subtitles'] - assert 'es' in info['subtitles'] + assert len(info["subtitles"]) == 4 # brittle; can change if subs get added + assert "ru" in info["subtitles"] + assert "en" in info["subtitles"] + assert "zh-CN" in info["subtitles"] + assert "es" in info["subtitles"] + def test_non_youtube_url_error(): - url = 'https://vimeo.com/238190750' + url = "https://vimeo.com/238190750" with pytest.raises(utils.VideoURLFormatError): youtube.YouTubeResource(url) @@ -121,37 +128,44 @@ def test_subtitles_lang_helpers_compatible(): """ yt_resource = get_yt_resource(subtitles_zu_video) info = yt_resource.get_resource_subtitles() - all_subtitles = info['subtitles'] + all_subtitles = info["subtitles"] # 1. filter out non-vtt subs vtt_subtitles = {} for youtube_language, subs in all_subtitles.items(): - vtt_subtitles[youtube_language] = [s for s in subs if s['ext'] == 'vtt'] + vtt_subtitles[youtube_language] = [s for s in subs if s["ext"] == "vtt"] for youtube_language, sub_dict in vtt_subtitles.items(): # 2. check compatibility with le-utils language codes (a.k.a. internal representation) verdict = youtube.is_youtube_subtitle_file_supported_language(youtube_language) - assert verdict == True, 'Wrongly marked youtube_language as incompatible' + assert verdict == True, "Wrongly marked youtube_language as incompatible" # 3. TODO: figure out what to do for incompatible langs # 4. map youtube_language to le-utils language code (a.k.a. internal representation) language_obj = youtube.get_language_with_alpha2_fallback(youtube_language) - assert language_obj is not None, 'Failed to find matchin language code in le-utils' - if youtube_language == 'zu': - assert language_obj.code == 'zul', 'Matched to wrong language code in le-utils' + assert ( + language_obj is not None + ), "Failed to find matchin language code in le-utils" + if youtube_language == "zu": + assert ( + language_obj.code == "zul" + ), "Matched to wrong language code in le-utils" def test_subtitles_lang_helpers_incompatible(): """ Ensure `is_youtube_subtitle_file_supported_language` rejects unknown language codes. 
""" - verdict1 = youtube.is_youtube_subtitle_file_supported_language('patapata') - assert verdict1 == False, 'Failed to reject incompatible youtube_language' - verdict2 = youtube.is_youtube_subtitle_file_supported_language('zzz') - assert verdict2 == False, 'Failed to reject incompatible youtube_language' + verdict1 = youtube.is_youtube_subtitle_file_supported_language("patapata") + assert verdict1 == False, "Failed to reject incompatible youtube_language" + verdict2 = youtube.is_youtube_subtitle_file_supported_language("zzz") + assert verdict2 == False, "Failed to reject incompatible youtube_language" -@pytest.mark.skipif(not 'PYTEST_RUN_SLOW' in os.environ, reason="This test can take several minutes to complete.") +@pytest.mark.skipif( + not "PYTEST_RUN_SLOW" in os.environ, + reason="This test can take several minutes to complete.", +) @pytest.mark.parametrize("useproxy", [True, False]) @pytest.mark.parametrize("useproxy_for_download", [False]) def test_download_from_web_video_file(tmp_path, useproxy, useproxy_for_download): @@ -165,49 +179,62 @@ def test_download_from_web_video_file(tmp_path, useproxy, useproxy_for_download) # STEP 1: get_resource_info via proxy settings = {} maxheight = 480 - settings['format'] = "bestvideo[height<={maxheight}][ext=mp4]+bestaudio[ext=m4a]/best[height<={maxheight}][ext=mp4]".format(maxheight=maxheight) - settings['outtmpl'] = destination_path - yt_resource = youtube.YouTubeResource(youtube_url, useproxy=useproxy, options=settings) + settings[ + "format" + ] = "bestvideo[height<={maxheight}][ext=mp4]+bestaudio[ext=m4a]/best[height<={maxheight}][ext=mp4]".format( + maxheight=maxheight + ) + settings["outtmpl"] = destination_path + yt_resource = youtube.YouTubeResource( + youtube_url, useproxy=useproxy, options=settings + ) video_node1 = yt_resource.get_resource_info() - assert video_node1, 'no data returned' + assert video_node1, "no data returned" # STEP 2: download # overwrite default download behaviour by setting custom options download_settings = {} download_settings["writethumbnail"] = False download_settings["outtmpl"] = destination_path - video_node2 = yt_resource.download(options=download_settings, useproxy=useproxy_for_download) - assert os.path.exists(destination_path), 'Missing video file' + video_node2 = yt_resource.download( + options=download_settings, useproxy=useproxy_for_download + ) + assert os.path.exists(destination_path), "Missing video file" -@pytest.mark.skipif(not 'PYTEST_RUN_SLOW' in os.environ, reason="This test can take several minutes to complete.") +@pytest.mark.skipif( + not "PYTEST_RUN_SLOW" in os.environ, + reason="This test can take several minutes to complete.", +) @pytest.mark.parametrize("useproxy", [True, False]) @pytest.mark.parametrize("useproxy_for_download", [False]) def test_download_from_web_subtitle_file(tmp_path, useproxy, useproxy_for_download): """ Use YouTubeResource the same way YouTubeSubtitleFile when proxy is enabled. 
""" - for youtube_url, lang in [(subtitles_video,'ru'), (subtitles_zu_video, 'zu')]: + for youtube_url, lang in [(subtitles_video, "ru"), (subtitles_zu_video, "zu")]: destination_path_noext = os.path.join(tmp_path, youtube_url[-11:]) download_ext = ".{lang}.{ext}".format(lang=lang, ext=file_formats.VTT) destination_path = destination_path_noext + download_ext # STEP 1: get_resource_info settings = { - 'outtmpl': destination_path_noext, # note no ext -- YoutubeDL will auto append it, - 'skip_download': True, - 'writesubtitles': True, - 'subtitleslangs': [lang], - 'subtitlesformat': "best[ext={}]".format(file_formats.VTT), - 'quiet': True, - 'verbose': True, - 'no_warnings': True + "outtmpl": destination_path_noext, # note no ext -- YoutubeDL will auto append it, + "skip_download": True, + "writesubtitles": True, + "subtitleslangs": [lang], + "subtitlesformat": "best[ext={}]".format(file_formats.VTT), + "quiet": True, + "verbose": True, + "no_warnings": True, } web_url = youtube_url - yt_resource = youtube.YouTubeResource(web_url, useproxy=useproxy, options=settings) + yt_resource = youtube.YouTubeResource( + web_url, useproxy=useproxy, options=settings + ) video_node = yt_resource.get_resource_info() # checks for STEP 1 - assert video_node['subtitles'], 'missing subtitles key' + assert video_node["subtitles"], "missing subtitles key" # STEP 2: download # overwrite default download behaviour by setting custom options @@ -216,4 +243,4 @@ def test_download_from_web_subtitle_file(tmp_path, useproxy, useproxy_for_downlo download_settings["outtmpl"] = destination_path_noext yt_resource.download(options=download_settings, useproxy=useproxy_for_download) # checks for STEP 2 - assert os.path.exists(destination_path), 'Missing subtitles file' + assert os.path.exists(destination_path), "Missing subtitles file" diff --git a/tests/test_argparse.py b/tests/test_argparse.py index 5aa44a67..fda1782f 100644 --- a/tests/test_argparse.py +++ b/tests/test_argparse.py @@ -1,60 +1,77 @@ -import pytest import sys +import pytest from mock import patch -from ricecooker.exceptions import InvalidUsageException from ricecooker.chefs import SushiChef +from ricecooker.exceptions import InvalidUsageException @pytest.fixture def cli_args_and_expected(): defaults = { - 'command': 'uploadchannel', - 'update': False, - 'verbose': True, 'debug': False, 'warn': False, 'quiet': False, - 'compress': False, - 'thumbnails': False, - 'download_attempts': 3, - 'resume': False, 'step': 'LAST', 'prompt': False, 'reset_deprecated': False, - 'stage': True, 'stage_deprecated': False, - 'publish': False, - 'sample': None, + "command": "uploadchannel", + "update": False, + "verbose": True, + "debug": False, + "warn": False, + "quiet": False, + "compress": False, + "thumbnails": False, + "download_attempts": 3, + "resume": False, + "step": "LAST", + "prompt": False, + "reset_deprecated": False, + "stage": True, + "stage_deprecated": False, + "publish": False, + "sample": None, } return [ - { # this used to be the old recommended CLI args to run chefs - 'cli_input': './sushichef.py -v --reset --token=letoken', - 'expected_args': dict(defaults, token='letoken', reset_deprecated=True), - 'expected_options': {}, + { # this used to be the old recommended CLI args to run chefs + "cli_input": "./sushichef.py -v --reset --token=letoken", + "expected_args": dict(defaults, token="letoken", reset_deprecated=True), + "expected_options": {}, }, - { # nowadays we've changed the CLI defaults so don't need to specify these - 'cli_input': './sushichef.py 
--token=letoken', - 'expected_args': dict(defaults, token='letoken'), - 'expected_options': {}, + { # nowadays we've changed the CLI defaults so don't need to specify these + "cli_input": "./sushichef.py --token=letoken", + "expected_args": dict(defaults, token="letoken"), + "expected_options": {}, }, { - 'cli_input': './sushichef.py --token=letoken --resume --step=START_UPLOAD', - 'expected_args': dict(defaults, token='letoken', resume=True, step='START_UPLOAD'), - 'expected_options': {}, + "cli_input": "./sushichef.py --token=letoken --resume --step=START_UPLOAD", + "expected_args": dict( + defaults, token="letoken", resume=True, step="START_UPLOAD" + ), + "expected_options": {}, }, { - 'cli_input': './sushichef.py --token=letoken lang=fr', - 'expected_args': dict(defaults, token='letoken'), - 'expected_options': dict(lang='fr') + "cli_input": "./sushichef.py --token=letoken lang=fr", + "expected_args": dict(defaults, token="letoken"), + "expected_options": dict(lang="fr"), }, { - 'cli_input': './sushichef.py --token=letoken somethin=else extrakey=extraval', - 'expected_args': dict(defaults, token='letoken'), - 'expected_options': dict(somethin='else', extrakey='extraval') + "cli_input": "./sushichef.py --token=letoken somethin=else extrakey=extraval", + "expected_args": dict(defaults, token="letoken"), + "expected_options": dict(somethin="else", extrakey="extraval"), }, { - 'cli_input': './sushichef.py -uv --warn --compress --download-attempts=4 --token=besttokenever --resume --step=PUBLISH_CHANNEL --prompt --deploy --publish', - 'expected_args': dict(defaults, - update=True, - warn=True, compress=True, - download_attempts=4, token='besttokenever', resume=True, step='PUBLISH_CHANNEL', - prompt=True, stage=False, publish=True), - 'expected_options': {} + "cli_input": "./sushichef.py -uv --warn --compress --download-attempts=4 --token=besttokenever --resume --step=PUBLISH_CHANNEL --prompt --deploy --publish", + "expected_args": dict( + defaults, + update=True, + warn=True, + compress=True, + download_attempts=4, + token="besttokenever", + resume=True, + step="PUBLISH_CHANNEL", + prompt=True, + stage=False, + publish=True, + ), + "expected_options": {}, }, ] @@ -64,22 +81,22 @@ def chef_arg_parser(cli_input): Takes a string `cli_input` and parses it using the SushiChef arg parser. Returns tuple of args and options. 
""" - test_argv = cli_input.split(' ') - with patch.object(sys, 'argv', test_argv): + test_argv = cli_input.split(" ") + with patch.object(sys, "argv", test_argv): chef = SushiChef() args, options = chef.parse_args_and_options() - assert args is not None, 'argparse parsing failed' + assert args is not None, "argparse parsing failed" return args, options - """ *********** CLI ARGUMENTS TESTS *********** """ + def test_basic_command_line_args_and_options(cli_args_and_expected): for case in cli_args_and_expected: - cli_input = case['cli_input'] - expected_args = case['expected_args'] - expected_options = case['expected_options'] + cli_input = case["cli_input"] + expected_args = case["expected_args"] + expected_options = case["expected_options"] args, options = chef_arg_parser(cli_input) @@ -93,12 +110,11 @@ def test_basic_command_line_args_and_options(cli_args_and_expected): def test_cannot_publish_without_deploy(): - bad_cli_input = './sushichef.py --token=letoken --publish' + bad_cli_input = "./sushichef.py --token=letoken --publish" with pytest.raises(InvalidUsageException): args, options = chef_arg_parser(bad_cli_input) - good_cli_input = './sushichef.py --token=letoken --deploy --publish' + good_cli_input = "./sushichef.py --token=letoken --deploy --publish" args, options = chef_arg_parser(good_cli_input) - assert args['stage'] == False - assert args['publish'] == True - + assert args["stage"] == False + assert args["publish"] == True diff --git a/tests/test_csv_metadata.py b/tests/test_csv_metadata.py index 7f55f609..4ae73bfb 100644 --- a/tests/test_csv_metadata.py +++ b/tests/test_csv_metadata.py @@ -1,30 +1,36 @@ """ Tests for CSV exercises channel logic """ import os -import pytest import tempfile +import pytest + from ricecooker.chefs import LineCook from ricecooker.utils.jsontrees import read_tree_from_json from ricecooker.utils.metadata_provider import CsvMetadataProvider - @pytest.fixture def channeldir(): - return os.path.join('tests', 'testchannels', 'csv_channel_with_exercises', 'channeldir') + return os.path.join( + "tests", "testchannels", "csv_channel_with_exercises", "channeldir" + ) def test_exercises_metadata_provider(channeldir): _, channeldirname = os.path.split(channeldir) mp = CsvMetadataProvider(channeldir) - assert mp is not None, 'CsvMetadataProvider does not exist' + assert mp is not None, "CsvMetadataProvider does not exist" mp.validate_headers() - assert mp.has_exercises(), 'has exercises' - assert mp.get_channel_info()['source_id'] == 'csv_channel_with_exercises', 'check source id' + assert mp.has_exercises(), "has exercises" + assert ( + mp.get_channel_info()["source_id"] == "csv_channel_with_exercises" + ), "check source id" # - assert len(mp.contentcache.keys()) == 8, 'Found too many items' - assert len(mp.get_exercises_for_dir((channeldirname,))) == 1, 'one exercise in root' - assert len(mp.get_exercises_for_dir((channeldirname,'exercises'))) == 3, '3 exercise in exercises/' + assert len(mp.contentcache.keys()) == 8, "Found too many items" + assert len(mp.get_exercises_for_dir((channeldirname,))) == 1, "one exercise in root" + assert ( + len(mp.get_exercises_for_dir((channeldirname, "exercises"))) == 3 + ), "3 exercise in exercises/" def test_exercises_linecook(channeldir): @@ -32,26 +38,24 @@ def test_exercises_linecook(channeldir): linecook = LineCook() linecook.TREES_DATA_DIR = tmpdir_path - linecook.RICECOOKER_JSON_TREE = 'test_ricecooker_json_tree.json' + linecook.RICECOOKER_JSON_TREE = "test_ricecooker_json_tree.json" args = dict( 
channeldir=channeldir, - channelinfo='Channel.csv', - contentinfo='Content.csv', - exercisesinfo='Exercises.csv', - questionsinfo='ExerciseQuestions.csv', - token='???', + channelinfo="Channel.csv", + contentinfo="Content.csv", + exercisesinfo="Exercises.csv", + questionsinfo="ExerciseQuestions.csv", + token="???", ) options = {} linecook.pre_run(args, options) - + jsontree_path = os.path.join(tmpdir_path, linecook.RICECOOKER_JSON_TREE) - assert os.path.exists(jsontree_path), 'output json exists' + assert os.path.exists(jsontree_path), "output json exists" test_tree = read_tree_from_json(jsontree_path) - assert len(test_tree['children']) == 3, 'exercise node + two dirs' + assert len(test_tree["children"]) == 3, "exercise node + two dirs" # cleanup os.remove(jsontree_path) os.rmdir(tmpdir_path) - - diff --git a/tests/test_data.py b/tests/test_data.py index 1f3a07b2..6f8d9e11 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,17 +1,23 @@ """ Tests for data validation and construction """ - import json import os -import pytest import uuid -from le_utils.constants import licenses, content_kinds, exercises, roles -from ricecooker.classes.nodes import ChannelNode, TopicNode -from ricecooker.exceptions import InvalidNodeException, InvalidQuestionException +import pytest +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import licenses +from le_utils.constants import roles + +from ricecooker.classes.nodes import ChannelNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.exceptions import InvalidNodeException +from ricecooker.exceptions import InvalidQuestionException """ *********** CHANNEL TESTS *********** """ + def test_init(channel, topic, video, audio, document, html, exercise): # Channel init assert channel, "Channel was not created" @@ -28,8 +34,26 @@ def test_init(channel, topic, video, audio, document, html, exercise): assert html, "HTML was not created" assert exercise, "Exercise was not created" -def test_validate(channel, invalid_channel, topic, contentnode_invalid_license, contentnode_invalid_files, contentnode_no_source_id, video, video_invalid_files, - audio, audio_invalid_files, document, document_invalid_files, html, html_invalid_files, html_invalid_zip, exercise, exercise_invalid_question): + +def test_validate( + channel, + invalid_channel, + topic, + contentnode_invalid_license, + contentnode_invalid_files, + contentnode_no_source_id, + video, + video_invalid_files, + audio, + audio_invalid_files, + document, + document_invalid_files, + html, + html_invalid_files, + html_invalid_zip, + exercise, + exercise_invalid_question, +): assert channel.validate(), "Valid channel should pass validation" pytest.raises(InvalidNodeException, invalid_channel.validate) assert topic.validate(), "Valid topics should pass validation" @@ -51,123 +75,176 @@ def test_validate(channel, invalid_channel, topic, contentnode_invalid_license, pytest.raises(InvalidQuestionException, exercise_invalid_question.validate) - - """ *********** ALT DOMAIN TESTS *********** """ + @pytest.fixture def topic_domain_namespace(channel_domain_namespace): - return uuid.uuid5(channel_domain_namespace, 'alt-source-id') + return uuid.uuid5(channel_domain_namespace, "alt-source-id") + @pytest.fixture def topic_alt_content_id(topic_domain_namespace): - return uuid.uuid5(topic_domain_namespace, 'test-alt') + return uuid.uuid5(topic_domain_namespace, "test-alt") + @pytest.fixture def topic_alt_node_id(channel_node_id, 
topic_alt_content_id): return uuid.uuid5(channel_node_id, topic_alt_content_id.hex) + @pytest.fixture def topic_alternative_domain(topic_domain_namespace, title, channel): - topic = TopicNode('test-alt', title, domain_ns=topic_domain_namespace) + topic = TopicNode("test-alt", title, domain_ns=topic_domain_namespace) channel.add_child(topic) return topic -def test_alternative_domain_namespace(topic_alternative_domain, topic_domain_namespace, topic_alt_node_id, topic_alt_content_id): - assert topic_alternative_domain.get_domain_namespace() == topic_domain_namespace, "Topic domain should be {}".format(topic_domain_namespace) - assert topic_alternative_domain.get_content_id() == topic_alt_content_id, "Topic content id should be {}".format(topic_alt_content_id) - assert topic_alternative_domain.get_node_id() == topic_alt_node_id, "Topic node id should be {}".format(topic_alt_node_id) - +def test_alternative_domain_namespace( + topic_alternative_domain, + topic_domain_namespace, + topic_alt_node_id, + topic_alt_content_id, +): + assert ( + topic_alternative_domain.get_domain_namespace() == topic_domain_namespace + ), "Topic domain should be {}".format(topic_domain_namespace) + assert ( + topic_alternative_domain.get_content_id() == topic_alt_content_id + ), "Topic content id should be {}".format(topic_alt_content_id) + assert ( + topic_alternative_domain.get_node_id() == topic_alt_node_id + ), "Topic node id should be {}".format(topic_alt_node_id) """ *********** TO_DICT TESTS *********** """ + def test_channel_to_dict(channel, channel_data): channel_dict = channel.to_dict().items() - assert len(channel_dict) == len(channel_data.items()), "Channel to_dict does not have the expected number of fields" + assert len(channel_dict) == len( + channel_data.items() + ), "Channel to_dict does not have the expected number of fields" for key, value in channel_dict: - assert value == channel_data[key], "Mismatched {}: {} != {}".format(key, value, channel_data[key]) + assert value == channel_data[key], "Mismatched {}: {} != {}".format( + key, value, channel_data[key] + ) + def test_topic_to_dict(topic, topic_data): topic_dict = topic.to_dict() - topic_data['extra_fields'] = json.dumps(topic_data['extra_fields']) + topic_data["extra_fields"] = json.dumps(topic_data["extra_fields"]) for key, _ in topic_data.items(): - assert key in topic_dict, "Key {} is not found in topic to_dict method".format(key) + assert key in topic_dict, "Key {} is not found in topic to_dict method".format( + key + ) for key, value in topic_dict.items(): - assert value == topic_data.get(key), "Mismatched {}: {} != {}".format(key, value, topic_data[key]) + assert value == topic_data.get(key), "Mismatched {}: {} != {}".format( + key, value, topic_data[key] + ) + def test_video_to_dict(video, video_data): video_dict = video.to_dict() - video_dict.pop('files') - expected_files = video_data.pop('files') - video_data['extra_fields'] = json.dumps(video_data['extra_fields']) + video_dict.pop("files") + expected_files = video_data.pop("files") + video_data["extra_fields"] = json.dumps(video_data["extra_fields"]) assert video.files == expected_files, "Video files do not match" for key, _ in video_data.items(): assert key in video_dict, "Key {} is not found in to_dict method".format(key) for key, value in video_dict.items(): - assert value == video_data.get(key), "Mismatched {}: {} != {}".format(key, value, video_data[key]) + assert value == video_data.get(key), "Mismatched {}: {} != {}".format( + key, value, video_data[key] + ) + def 
test_audio_to_dict(audio, audio_data): audio_dict = audio.to_dict() - audio_dict.pop('files') - expected_files = audio_data.pop('files') - audio_data['extra_fields'] = json.dumps(audio_data['extra_fields']) + audio_dict.pop("files") + expected_files = audio_data.pop("files") + audio_data["extra_fields"] = json.dumps(audio_data["extra_fields"]) assert audio.files == expected_files, "Audio files do not match" for key, _ in audio_data.items(): assert key in audio_dict, "Key {} is not found in to_dict method".format(key) for key, value in audio_dict.items(): - assert value == audio_data.get(key), "Mismatched {}: {} != {}".format(key, value, audio_data[key]) + assert value == audio_data.get(key), "Mismatched {}: {} != {}".format( + key, value, audio_data[key] + ) + def test_document_to_dict(document, document_data): document_dict = document.to_dict() - document_dict.pop('files') - expected_files = document_data.pop('files') - document_data['extra_fields'] = json.dumps(document_data['extra_fields']) + document_dict.pop("files") + expected_files = document_data.pop("files") + document_data["extra_fields"] = json.dumps(document_data["extra_fields"]) assert document.files == expected_files, "Document files do not match" for key, _ in document_data.items(): assert key in document_dict, "Key {} is not found in to_dict method".format(key) for key, value in document_dict.items(): - assert value == document_data.get(key), "Mismatched {}: {} != {}".format(key, value, document_data[key]) + assert value == document_data.get(key), "Mismatched {}: {} != {}".format( + key, value, document_data[key] + ) + def test_html_to_dict(html, html_data): html_dict = html.to_dict() - html_dict.pop('files') - expected_files = html_data.pop('files') - html_data['extra_fields'] = json.dumps(html_data['extra_fields']) + html_dict.pop("files") + expected_files = html_data.pop("files") + html_data["extra_fields"] = json.dumps(html_data["extra_fields"]) assert html.files == expected_files, "HTML files do not match" for key, _ in html_data.items(): assert key in html_dict, "Key {} is not found in to_dict method".format(key) for key, value in html_dict.items(): - assert value == html_data.get(key), "Mismatched {}: {} != {}".format(key, value, html_data[key]) + assert value == html_data.get(key), "Mismatched {}: {} != {}".format( + key, value, html_data[key] + ) + def test_exercise_to_dict(exercise, exercise_data): exercise_dict = exercise.to_dict() - exercise_dict.pop('questions') - the_exercise_data = json.loads(exercise_dict['extra_fields']) - assert the_exercise_data == exercise_data['extra_fields'], 'Different extra_fields found' - del exercise_dict['extra_fields'] - del exercise_data['extra_fields'] - assert exercise.questions == exercise_data.pop('questions'), "Exercise questions do not match" + exercise_dict.pop("questions") + the_exercise_data = json.loads(exercise_dict["extra_fields"]) + assert ( + the_exercise_data == exercise_data["extra_fields"] + ), "Different extra_fields found" + del exercise_dict["extra_fields"] + del exercise_data["extra_fields"] + assert exercise.questions == exercise_data.pop( + "questions" + ), "Exercise questions do not match" for key, _ in exercise_data.items(): assert key in exercise_dict, "Key {} is not found in to_dict method".format(key) for key, value in exercise_dict.items(): - assert value == exercise_data.get(key), "Mismatched {}: {} != {}".format(key, value, exercise_data[key]) + assert value == exercise_data.get(key), "Mismatched {}: {} != {}".format( + key, value, 
exercise_data[key] + ) + def test_slideshow_to_dict(slideshow, slideshow_data): slideshow_dict = slideshow.to_dict() - extra_fields = json.loads(slideshow_dict['extra_fields']) - assert len(extra_fields['slideshow_data']) == 10, 'wrong num slides' - expected_field_keys = { 'caption', 'descriptive_text', 'checksum', 'sort_order', 'extension'} - assert all([set(sd.keys()) == expected_field_keys for sd in extra_fields['slideshow_data']]), 'extra_fields is missing expected fields' - del slideshow_data['extra_fields'] - del slideshow_dict['extra_fields'] + extra_fields = json.loads(slideshow_dict["extra_fields"]) + assert len(extra_fields["slideshow_data"]) == 10, "wrong num slides" + expected_field_keys = { + "caption", + "descriptive_text", + "checksum", + "sort_order", + "extension", + } + assert all( + [set(sd.keys()) == expected_field_keys for sd in extra_fields["slideshow_data"]] + ), "extra_fields is missing expected fields" + del slideshow_data["extra_fields"] + del slideshow_dict["extra_fields"] # - expected_files = slideshow_data.pop('files') - slideshow_dict.pop('files') + expected_files = slideshow_data.pop("files") + slideshow_dict.pop("files") assert slideshow.files == expected_files, "slideshow_images do not match" for key, _ in slideshow_data.items(): - assert key in slideshow_dict, "Key {} is not found in to_dict method".format(key) + assert key in slideshow_dict, "Key {} is not found in to_dict method".format( + key + ) for key, value in slideshow_dict.items(): - assert value == slideshow_data.get(key), "Mismatched {}: {} != {}".format(key, value, slideshow_data[key]) - + assert value == slideshow_data.get(key), "Mismatched {}: {} != {}".format( + key, value, slideshow_data[key] + ) diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 6b953373..911e08a6 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -6,58 +6,67 @@ class TestArchiver(unittest.TestCase): def test_get_archive_filename_absolute(self): - link = 'https://learningequality.org/kolibri.png' + link = "https://learningequality.org/kolibri.png" urls_to_replace = {} - result = downloader.get_archive_filename(link, download_root='./', resource_urls=urls_to_replace) + result = downloader.get_archive_filename( + link, download_root="./", resource_urls=urls_to_replace + ) - expected = os.path.join('learningequality.org', 'kolibri.png') + expected = os.path.join("learningequality.org", "kolibri.png") assert result == expected assert urls_to_replace[link] == expected def test_get_archive_filename_relative(self): - link = '../kolibri.png' - page_link = 'https://learningequality.org/team/index.html' + link = "../kolibri.png" + page_link = "https://learningequality.org/team/index.html" urls_to_replace = {} - result = downloader.get_archive_filename(link, page_url=page_link, - download_root='./', resource_urls=urls_to_replace) + result = downloader.get_archive_filename( + link, page_url=page_link, download_root="./", resource_urls=urls_to_replace + ) - expected = os.path.join('learningequality.org', 'kolibri.png') + expected = os.path.join("learningequality.org", "kolibri.png") assert result == expected assert urls_to_replace[link] == expected def test_get_archive_filename_with_query(self): - link = '../kolibri.png?1.2.3' - page_link = 'https://learningequality.org/team/index.html' + link = "../kolibri.png?1.2.3" + page_link = "https://learningequality.org/team/index.html" urls_to_replace = {} - result = downloader.get_archive_filename(link, page_url=page_link, - download_root='./', 
resource_urls=urls_to_replace) + result = downloader.get_archive_filename( + link, page_url=page_link, download_root="./", resource_urls=urls_to_replace + ) - expected = os.path.join('learningequality.org', 'kolibri_1.2.3.png') + expected = os.path.join("learningequality.org", "kolibri_1.2.3.png") assert result == expected assert urls_to_replace[link] == expected - link = '../kolibri.png?v=1.2.3&i=u' - page_link = 'https://learningequality.org/team/index.html' + link = "../kolibri.png?v=1.2.3&i=u" + page_link = "https://learningequality.org/team/index.html" urls_to_replace = {} - result = downloader.get_archive_filename(link, page_url=page_link, - download_root='./', resource_urls=urls_to_replace) + result = downloader.get_archive_filename( + link, page_url=page_link, download_root="./", resource_urls=urls_to_replace + ) - expected = os.path.join('learningequality.org', 'kolibri_v_1.2.3_i_u.png') + expected = os.path.join("learningequality.org", "kolibri_v_1.2.3_i_u.png") assert result == expected assert urls_to_replace[link] == expected def test_archive_path_as_relative_url(self): - link = '../kolibri.png?1.2.3' - page_link = 'https://learningequality.org/team/index.html' - page_filename = downloader.get_archive_filename(page_link, download_root='./') - link_filename = downloader.get_archive_filename(link, page_url=page_link, download_root='./') - rel_path = downloader.get_relative_url_for_archive_filename(link_filename, page_filename) - assert rel_path == '../kolibri_1.2.3.png' + link = "../kolibri.png?1.2.3" + page_link = "https://learningequality.org/team/index.html" + page_filename = downloader.get_archive_filename(page_link, download_root="./") + link_filename = downloader.get_archive_filename( + link, page_url=page_link, download_root="./" + ) + rel_path = downloader.get_relative_url_for_archive_filename( + link_filename, page_filename + ) + assert rel_path == "../kolibri_1.2.3.png" diff --git a/tests/test_exercises.py b/tests/test_exercises.py index 32c13ff4..689ff586 100644 --- a/tests/test_exercises.py +++ b/tests/test_exercises.py @@ -1,84 +1,100 @@ """ Tests for exercise nodes, questions, and files """ import os -import pytest import re -import uuid import tempfile -from le_utils.constants import licenses, content_kinds, exercises +import uuid + +import pytest +from le_utils.constants import content_kinds +from le_utils.constants import exercises +from le_utils.constants import licenses +from test_videos import _clear_ricecookerfilecache + from ricecooker.classes.nodes import * -from ricecooker.classes.questions import BaseQuestion, PerseusQuestion, SingleSelectQuestion +from ricecooker.classes.questions import BaseQuestion +from ricecooker.classes.questions import PerseusQuestion +from ricecooker.classes.questions import SingleSelectQuestion from ricecooker.config import STORAGE_DIRECTORY -from test_videos import _clear_ricecookerfilecache TESTS_DIR = os.path.dirname(os.path.abspath(__file__)) -TESTCONTENT_DIR = os.path.join(TESTS_DIR, 'testcontent') +TESTCONTENT_DIR = os.path.join(TESTS_DIR, "testcontent") """ *********** EXERCISE FIXTURES *********** """ + + @pytest.fixture def exercise_id(): return "exercise-id" + @pytest.fixture def channel_internal_domain(): - return "learningequality.org".encode('utf-8') + return "learningequality.org".encode("utf-8") + @pytest.fixture def topic_node_id(): - return 'some-node-id' + return "some-node-id" + @pytest.fixture def exercise_content_id(channel_internal_domain, exercise_id): return uuid.uuid5(channel_internal_domain, exercise_id) 
+ @pytest.fixture def exercise_node_id(topic_node_id, exercise_content_id): return uuid.uuid5(topic_node_id, exercise_content_id.hex) + @pytest.fixture def exercise_data(exercise_id): return { - "title": "exercise node test", - "description": None, - "id" : exercise_id, + "title": "exercise node test", + "description": None, + "id": exercise_id, "author": None, "license": licenses.PUBLIC_DOMAIN, } + @pytest.fixture def exercise_questions(): return [ - SingleSelectQuestion( - id='123', - question='What is your quest?', - correct_answer='To spectacularly fail', - all_answers=[ - 'To seek the grail', - 'To eat some hail', - 'To spectacularly fail', - 'To post bail' - ] - ) - ] + SingleSelectQuestion( + id="123", + question="What is your quest?", + correct_answer="To spectacularly fail", + all_answers=[ + "To seek the grail", + "To eat some hail", + "To spectacularly fail", + "To post bail", + ], + ) + ] + @pytest.fixture def exercise(exercise_data, channel_internal_domain, topic_node_id, exercise_questions): node = ExerciseNode( - source_id=exercise_data['id'], + source_id=exercise_data["id"], # description=exercise_data['description'], - title=exercise_data['title'], - author=exercise_data['author'], - license=exercise_data['license'], - questions=exercise_questions + title=exercise_data["title"], + author=exercise_data["author"], + license=exercise_data["license"], + questions=exercise_questions, ) # node.set_ids(channel_internal_domain, topic_node_id) return node + @pytest.fixture def exercise_json(exercise_data, exercise_content_id, exercise_node_id): return { - "id" : exercise_data['id'], - "title": exercise_data['title'], + "id": exercise_data["id"], + "title": exercise_data["title"], "description": "", "node_id": exercise_node_id.hex, "content_id": exercise_content_id.hex, @@ -86,28 +102,28 @@ def exercise_json(exercise_data, exercise_content_id, exercise_node_id): "children": [], "files": [], "kind": exercises.PERSEUS_QUESTION, - "license": exercise_data['license'], + "license": exercise_data["license"], } """ *********** EXERCISE TESTS *********** """ + + def test_exercise_created(exercise): assert exercise is not None + def test_exercise_validate(exercise, exercise_data): - assert exercise.source_id == exercise_data['id'] - assert exercise.title == exercise_data['title'] + assert exercise.source_id == exercise_data["id"] + assert exercise.title == exercise_data["title"] # assert exercise.description == exercise_data['description'] # assert exercise.author == exercise_data['author'] # assert exercise.license == exercise_data['license'] # assert exercise.kind == exercises.PERSEUS_QUESTION + def test_exercise_extra_fields_string(exercise): - exercise.extra_fields = { - 'mastery_model': exercises.M_OF_N, - 'm': '3', - 'n': '5' - } + exercise.extra_fields = {"mastery_model": exercises.M_OF_N, "m": "3", "n": "5"} # validate should call process_exercise_data, which will convert the values to # integers and validate values after that. @@ -115,15 +131,11 @@ def test_exercise_extra_fields_string(exercise): # conversion tools may fail to properly convert these fields to int values, # so make sure an int string gets read as a string. 
- assert exercise.extra_fields['m'] == 3 - assert exercise.extra_fields['n'] == 5 + assert exercise.extra_fields["m"] == 3 + assert exercise.extra_fields["n"] == 5 # Make sure we throw an error if we have non-int strings - exercise.extra_fields = { - 'mastery_model': exercises.M_OF_N, - 'm': '3.0', - 'n': '5.1' - } + exercise.extra_fields = {"mastery_model": exercises.M_OF_N, "m": "3.0", "n": "5.1"} with pytest.raises(ValueError): exercise.process_files() @@ -133,9 +145,9 @@ def test_exercise_extra_fields_string(exercise): # or any other type of string... exercise.extra_fields = { - 'mastery_model': exercises.M_OF_N, - 'm': 'three', - 'n': 'five' + "mastery_model": exercises.M_OF_N, + "m": "three", + "n": "five", } with pytest.raises(ValueError): @@ -144,20 +156,18 @@ def test_exercise_extra_fields_string(exercise): with pytest.raises(InvalidNodeException): exercise.validate() + def test_exercise_extra_fields_float(exercise): - exercise.extra_fields = { - 'mastery_model': exercises.M_OF_N, - 'm': 3.0, - 'n': 5.6 - } + exercise.extra_fields = {"mastery_model": exercises.M_OF_N, "m": 3.0, "n": 5.6} exercise.process_files() # ensure the fields end up as pure ints, using floor. - assert exercise.extra_fields['m'] == 3 - assert exercise.extra_fields['n'] == 5 + assert exercise.extra_fields["m"] == 3 + assert exercise.extra_fields["n"] == 5 exercise.validate() + # # def test_exercise_to_dict(exercise): # assert exercise.default_preset == exercises.PERSEUS_QUESTION @@ -173,127 +183,168 @@ def test_exercise_extra_fields_float(exercise): """ *********** BASE64FILE TESTS *********** """ + + def test_base64_process_file(): assert True + def test_base64_validate(): assert True + def test_base64_to_dict(): assert True + def test_base64_convert_base64_to_file(): assert True """ *********** EXERCISEBASE64FILE TESTS *********** """ + + def test_exercisebase64_process_file(): assert True + def test_exercisebase64_validate(): assert True + def test_exercisebase64_to_dict(): assert True + def test_exercisebase64_get_replacement_str(): assert True """ *********** EXERCISEIMAGEFILE TESTS *********** """ + + def test_exerciseimage_process_file(): assert True + def test_exerciseimage_validate(): assert True + def test_exerciseimage_to_dict(): assert True + def test_exerciseimage_get_replacement_str(): assert True """ *********** EXERCISEGRAPHIEFILE TESTS *********** """ + + def test_exercisegraphie_process_file(): assert True + def test_exercisegraphie_validate(): assert True + def test_exercisegraphie_to_dict(): assert True + def test_exercisegraphie_get_replacement_str(): assert True + def test_exercisegraphie_generate_graphie_file(): assert True """ *********** QUESTION TESTS *********** """ + + def test_question_to_dict(): assert True + def test_question_create_answer(): assert True + def test_question_process_question(): assert True + def test_question_set_images(): assert True + def test_question_parse_html(): assert True + def test_question_set_image(): assert True + def test_question_validate(): assert True """ *********** PERSEUSQUESTION TESTS *********** """ + + def test_perseusquestion_to_dict(): assert True + def test_perseusquestion_validate(): assert True + def test_perseusquestion_process_question(): assert True + def test_perseusquestion_process_image_field(): assert True """ *********** MULTIPLESELECTQUESTION TESTS *********** """ + + def test_multipleselectquestion_to_dict(): assert True + def test_multipleselectquestion_validate(): assert True """ *********** SINGLESELECTQUESTION TESTS 
*********** """ + + def test_singleselectquestion_to_dict(): assert True + def test_singleselectquestion_validate(): assert True """ *********** INPUTQUESTION TESTS *********** """ + + def test_inputquestion_to_dict(): assert True + def test_inputquestion_validate(): assert True - - ################################################################################ # Perseus image asset processing and image loading tests ################################################################################ @@ -302,6 +353,7 @@ def test_inputquestion_validate(): # Regex tests ################################################################################ + @pytest.fixture def graphie_strings_and_rawpath(): """ @@ -309,21 +361,25 @@ def graphie_strings_and_rawpath(): WEB_GRAPHIE_URL_REGEX = r'web\+graphie:(?P[^\)]+)' """ test_data = { - '![](web+graphie:somechunk)': 'somechunk', - 'alksjalksj ![](web+graphie:somechunk)': 'somechunk', - '![](web+graphie:http://yahoo.com/path/url.png)': 'http://yahoo.com/path/url.png', - '![graph](web+graphie://ka.s3.aws.com/fefe)': '//ka.s3.aws.com/fefe', + "![](web+graphie:somechunk)": "somechunk", + "alksjalksj ![](web+graphie:somechunk)": "somechunk", + "![](web+graphie:http://yahoo.com/path/url.png)": "http://yahoo.com/path/url.png", + "![graph](web+graphie://ka.s3.aws.com/fefe)": "//ka.s3.aws.com/fefe", } return test_data + def test_WEB_GRAPHIE_URL_REGEX_matches(graphie_strings_and_rawpath): from ricecooker.classes.questions import WEB_GRAPHIE_URL_REGEX - pat = re.compile(WEB_GRAPHIE_URL_REGEX, flags=re.IGNORECASE) + + pat = re.compile(WEB_GRAPHIE_URL_REGEX, flags=re.IGNORECASE) for sample_str, expected_rawpath in graphie_strings_and_rawpath.items(): m = pat.search(sample_str) - rawpath = m.groupdict()['rawpath'] - assert m, 'WEB_GRAPHIE_URL_REGEX failed to match string ' + sample_str - assert rawpath == expected_rawpath, 'found ' + rawpath + ' expected ' + expected_rawpath + rawpath = m.groupdict()["rawpath"] + assert m, "WEB_GRAPHIE_URL_REGEX failed to match string " + sample_str + assert rawpath == expected_rawpath, ( + "found " + rawpath + " expected " + expected_rawpath + ) @pytest.fixture @@ -333,65 +389,79 @@ def markdown_link_strings_and_match(): MARKDOWN_IMAGE_REGEX = r'!\[([^\]]+)?\]\(([^\)]+)\)' """ test_data = { - '![smth](path)': ('smth', 'path'), - 'blah ![smth](path) bof': ('smth', 'path'), - '![smth](http://url.org/path/file.png)': ('smth', 'http://url.org/path/file.png'), - '![smth](https://url.org/path/file.png)': ('smth', 'https://url.org/path/file.png'), - '![smth](//url.org/path/file.png)': ('smth', '//url.org/path/file.png'), - '![smth](web+graphie://ka.s3.aws.com/fefe)': ('smth', 'web+graphie://ka.s3.aws.com/fefe'), + "![smth](path)": ("smth", "path"), + "blah ![smth](path) bof": ("smth", "path"), + "![smth](http://url.org/path/file.png)": ( + "smth", + "http://url.org/path/file.png", + ), + "![smth](https://url.org/path/file.png)": ( + "smth", + "https://url.org/path/file.png", + ), + "![smth](//url.org/path/file.png)": ("smth", "//url.org/path/file.png"), + "![smth](web+graphie://ka.s3.aws.com/fefe)": ( + "smth", + "web+graphie://ka.s3.aws.com/fefe", + ), } return test_data + def test_MARKDOWN_IMAGE_REGEX_matches(markdown_link_strings_and_match): from ricecooker.classes.questions import MARKDOWN_IMAGE_REGEX - pat = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE) + + pat = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE) for sample_str, expected_matches in markdown_link_strings_and_match.items(): m = pat.search(sample_str) - 
assert m, 'MARKDOWN_IMAGE_REGEX failed to match string ' + sample_str - assert m.groups() == expected_matches, 'found ' + m.groups() + ' expected ' + expected_matches - - - - + assert m, "MARKDOWN_IMAGE_REGEX failed to match string " + sample_str + assert m.groups() == expected_matches, ( + "found " + m.groups() + " expected " + expected_matches + ) ## Tests to make sure BaseQuestion.set_image works correctly ################################################################################ + @pytest.fixture def image_texts_fixtures(): """ Return texts and corresponding content hashes for various types of image resources. """ - WEB_GRAPHIE_PREFIX = 'web+graphie:${☣ CONTENTSTORAGE}/' - WEB_PREFIX = '${☣ CONTENTSTORAGE}/' + WEB_GRAPHIE_PREFIX = "web+graphie:${☣ CONTENTSTORAGE}/" + WEB_PREFIX = "${☣ CONTENTSTORAGE}/" test_data = [ { - 'text': 'web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd', - 'replacement_str': WEB_GRAPHIE_PREFIX + 'eb3f3bf7c317408ee90995b5bcf4f3a59606aedd', - 'hash': 'ea2269bb5cf487f8d883144b9c06fbc7' + "text": "web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd", + "replacement_str": WEB_GRAPHIE_PREFIX + + "eb3f3bf7c317408ee90995b5bcf4f3a59606aedd", + "hash": "ea2269bb5cf487f8d883144b9c06fbc7", }, { - 'text': 'web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540', - 'replacement_str': WEB_GRAPHIE_PREFIX + 'd8daa074ec7d09ce3819d6259b3e4670701d2540', - 'hash': 'db98ca9d35b2fb97cde378a1fabddd26' + "text": "web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540", + "replacement_str": WEB_GRAPHIE_PREFIX + + "d8daa074ec7d09ce3819d6259b3e4670701d2540", + "hash": "db98ca9d35b2fb97cde378a1fabddd26", }, { - 'text': 'https://learningequality.org/static/img/le-logo.svg', - 'replacement_str': WEB_PREFIX + '52b097901664f83e6b7c92ae1af1721b.svg', - 'hash': '52b097901664f83e6b7c92ae1af1721b', - }, - { - 'text': 'https://learningequality.org/static/img/no-wifi.png', - 'replacement_str': WEB_PREFIX + '599aa896313be22dea6c0257772a464e.png', - 'hash': '599aa896313be22dea6c0257772a464e' - }, - { # slightly modified version of the above - 'text': os.path.relpath(os.path.join(TESTCONTENT_DIR, 'exercises', 'no-wifi.png')), - 'replacement_str': WEB_PREFIX + '599aa896313be22dea6c0257772a464e.png', - 'hash': '599aa896313be22dea6c0257772a464e' - }, + "text": "https://learningequality.org/static/img/le-logo.svg", + "replacement_str": WEB_PREFIX + "52b097901664f83e6b7c92ae1af1721b.svg", + "hash": "52b097901664f83e6b7c92ae1af1721b", + }, + { + "text": "https://learningequality.org/static/img/no-wifi.png", + "replacement_str": WEB_PREFIX + "599aa896313be22dea6c0257772a464e.png", + "hash": "599aa896313be22dea6c0257772a464e", + }, + { # slightly modified version of the above + "text": os.path.relpath( + os.path.join(TESTCONTENT_DIR, "exercises", "no-wifi.png") + ), + "replacement_str": WEB_PREFIX + "599aa896313be22dea6c0257772a464e.png", + "hash": "599aa896313be22dea6c0257772a464e", + }, ] return test_data @@ -405,42 +475,48 @@ def test_base_question_set_image(image_texts_fixtures): for datum in image_texts_fixtures: # setup _clear_ricecookerfilecache() # clear file cache each time to avoid test interactions - text = datum['text'] - replacement_str = datum['replacement_str'] - + text = datum["text"] + replacement_str = datum["replacement_str"] # SIT ################################################################## - testq = BaseQuestion(id='someid', 
question='somequestion', question_type='input', raw_data={}) + testq = BaseQuestion( + id="someid", question="somequestion", question_type="input", raw_data={} + ) new_text, images = testq.set_image(text) # check 1 - assert new_text == replacement_str, 'Unexpected replacement text produced by set_image' + assert ( + new_text == replacement_str + ), "Unexpected replacement text produced by set_image" # check 2 - assert len(images) == 1, 'Should find exactly one image' + assert len(images) == 1, "Should find exactly one image" # check 3 image_file = images[0] filename = image_file.get_filename() - assert datum['hash'] in filename, 'wront content hash for file' + assert datum["hash"] in filename, "wront content hash for file" # print('filename=', filename) - if text.startswith('web+graphie:'): - assert new_text.startswith('web+graphie:'), 'web+graphie: was lost' - assert filename.endswith('.graphie'), 'wrong extension for web+graphie text' + if text.startswith("web+graphie:"): + assert new_text.startswith("web+graphie:"), "web+graphie: was lost" + assert filename.endswith(".graphie"), "wrong extension for web+graphie text" expected_storage_dir = os.path.join(STORAGE_DIRECTORY, filename[0], filename[1]) expected_storage_path = os.path.join(expected_storage_dir, filename) - assert os.path.exists(expected_storage_path), 'Image file not saved to ricecooker storage dir' + assert os.path.exists( + expected_storage_path + ), "Image file not saved to ricecooker storage dir" # Test _recursive_url_find method ################################################################################ + def test_perseus__recursive_url_find(persues_question_json_fixtures): """ Run _recursive_url_find to check it correctly recognizes and rewrites `url` fields. """ # fixtures - sample_data_with_backgroundImage_url = { + sample_data_with_backgroundImage_url = { "question": { "content": "[[☃ interactive-graph 1]]\n\n", "images": {}, @@ -451,46 +527,46 @@ def test_perseus__recursive_url_find(persues_question_json_fixtures): "static": False, "graded": True, "options": { - "step": [1,1], + "step": [1, 1], "backgroundImage": { "url": "https://learningequality.org/static/img/no-wifi.png", "width": 184, - "height": 184 + "height": 184, }, "markings": "graph", - "labels": ["x","y"], - } + "labels": ["x", "y"], + }, } - } + }, } } - hash = '599aa896313be22dea6c0257772a464e' - + hash = "599aa896313be22dea6c0257772a464e" # setup image_files = [] test_data = sample_data_with_backgroundImage_url # SIT - testq = PerseusQuestion(id='someid', raw_data={}) + testq = PerseusQuestion(id="someid", raw_data={}) testq._recursive_url_find(test_data, image_files) # checks - new_url = test_data['question']['widgets']['interactive-graph 1']['options']['backgroundImage']['url'] - assert '☣ CONTENTSTORAGE' in new_url, 'url replacement not done' - assert hash in new_url, 'wrong url replacement' + new_url = test_data["question"]["widgets"]["interactive-graph 1"]["options"][ + "backgroundImage" + ]["url"] + assert "☣ CONTENTSTORAGE" in new_url, "url replacement not done" + assert hash in new_url, "wrong url replacement" assert len(image_files) == 1 image_file = image_files[0] filename = image_file.get_filename() - assert filename is not None, 'missing file' - assert hash in filename, 'wrong file hash' - - + assert filename is not None, "missing file" + assert hash in filename, "wrong file hash" # Test PerseusQuestion process_image_field method ################################################################################ + @pytest.fixture def 
persues_contentimages_field_fixtures(): """ @@ -500,39 +576,53 @@ def persues_contentimages_field_fixtures(): - `image_hashes`: content hash of image files that should get downloaded """ test_data = [ - # Known good test cases from KA English exercise - { - 'field': { 'content': 'a ![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb', - 'images': { 'web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd': {'width': 425, 'height': 425}}, - }, - 'new_content': 'a ![graph](web+graphie:${☣ CONTENTSTORAGE}/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb', - 'image_hashes': ['ea2269bb5cf487f8d883144b9c06fbc7'], - }, - { - 'field': {'content': 'The function $f$\n![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540)', - 'images': {'web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540': {'width': 425, 'height': 425}}, - 'widgets': {} - }, - 'new_content': 'The function $f$\n![graph](web+graphie:${☣ CONTENTSTORAGE}/d8daa074ec7d09ce3819d6259b3e4670701d2540)', - 'image_hashes': ['db98ca9d35b2fb97cde378a1fabddd26'], - }, - # - # Same as above two but with missing images - { - 'field': { 'content': 'a ![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb', - 'images': {}, - }, - 'new_content': 'a ![graph](web+graphie:${☣ CONTENTSTORAGE}/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb', - 'image_hashes': ['ea2269bb5cf487f8d883144b9c06fbc7'], - }, - { - 'field': {'content': 'The function $f$\n![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540)', - 'images': {}, - 'widgets': {} - }, - 'new_content': 'The function $f$\n![graph](web+graphie:${☣ CONTENTSTORAGE}/d8daa074ec7d09ce3819d6259b3e4670701d2540)', - 'image_hashes': ['db98ca9d35b2fb97cde378a1fabddd26'], - }, + # Known good test cases from KA English exercise + { + "field": { + "content": "a ![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb", + "images": { + "web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd": { + "width": 425, + "height": 425, + } + }, + }, + "new_content": "a ![graph](web+graphie:${☣ CONTENTSTORAGE}/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb", + "image_hashes": ["ea2269bb5cf487f8d883144b9c06fbc7"], + }, + { + "field": { + "content": "The function $f$\n![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540)", + "images": { + "web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540": { + "width": 425, + "height": 425, + } + }, + "widgets": {}, + }, + "new_content": "The function $f$\n![graph](web+graphie:${☣ CONTENTSTORAGE}/d8daa074ec7d09ce3819d6259b3e4670701d2540)", + "image_hashes": ["db98ca9d35b2fb97cde378a1fabddd26"], + }, + # + # Same as above two but with missing images + { + "field": { + "content": "a ![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb", + "images": {}, + }, + "new_content": "a ![graph](web+graphie:${☣ CONTENTSTORAGE}/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd)\nb", + "image_hashes": ["ea2269bb5cf487f8d883144b9c06fbc7"], + }, + { + "field": { + "content": "The function $f$\n![graph](web+graphie://ka-perseus-graphie.s3.amazonaws.com/d8daa074ec7d09ce3819d6259b3e4670701d2540)", + "images": {}, + "widgets": {}, + }, + "new_content": 
"The function $f$\n![graph](web+graphie:${☣ CONTENTSTORAGE}/d8daa074ec7d09ce3819d6259b3e4670701d2540)", + "image_hashes": ["db98ca9d35b2fb97cde378a1fabddd26"], + }, ] return test_data @@ -553,56 +643,72 @@ def test_persues_question_process_image_field(persues_contentimages_field_fixtur for fixture in persues_contentimages_field_fixtures: # setup _clear_ricecookerfilecache() # clear file cache each time to avoid test interactions - field = fixture['field'] - expected_new_content = fixture['new_content'] - expected_image_hashes = set(fixture['image_hashes']) + field = fixture["field"] + expected_new_content = fixture["new_content"] + expected_image_hashes = set(fixture["image_hashes"]) # SIT - testq = PerseusQuestion(id='x43bbec76d5f14f88_bg', raw_data={}) - new_images, image_files = testq.process_image_field(fixture['field']) + testq = PerseusQuestion(id="x43bbec76d5f14f88_bg", raw_data={}) + new_images, image_files = testq.process_image_field(fixture["field"]) # check 1 - assert field['content'] == expected_new_content, 'Image URL replacement failed' + assert field["content"] == expected_new_content, "Image URL replacement failed" # # check 2 for image_key, image_attrs in new_images.items(): - assert 'http' not in image_key, 'Images URLs not replace with local paths' + assert "http" not in image_key, "Images URLs not replace with local paths" # # check 3 image_hashes = set() for image_file in image_files: - assert image_file is not None, 'image_file should not be None' - filehash, ext = os.path.splitext(image_file.get_filename()) + assert image_file is not None, "image_file should not be None" + filehash, ext = os.path.splitext(image_file.get_filename()) image_hashes.add(filehash) - assert image_hashes == expected_image_hashes, 'Unexpected image files set' - + assert image_hashes == expected_image_hashes, "Unexpected image files set" # Test PerseusQuestion process_question method ################################################################################ + @pytest.fixture def persues_question_json_fixtures(): """ Load entire perseus questions """ test_data = [] - with open(os.path.join(TESTCONTENT_DIR, 'exercises', 'perseus_question_x43bbec76d5f14f88_en.json'), encoding="utf-8") as inf: + with open( + os.path.join( + TESTCONTENT_DIR, "exercises", "perseus_question_x43bbec76d5f14f88_en.json" + ), + encoding="utf-8", + ) as inf: # ENGLISH JSON = KNOWN GOOD item_data_en = json.load(inf) datum = { - 'item': item_data_en, - 'image_hashes': ['ea2269bb5cf487f8d883144b9c06fbc7', 'db98ca9d35b2fb97cde378a1fabddd26'] + "item": item_data_en, + "image_hashes": [ + "ea2269bb5cf487f8d883144b9c06fbc7", + "db98ca9d35b2fb97cde378a1fabddd26", + ], } test_data.append(datum) # Missing images in the KA BULGARIAN channel BUG # see https://github.com/learningequality/ricecooker/issues/178 - with open(os.path.join(TESTCONTENT_DIR, 'exercises', 'perseus_question_x43bbec76d5f14f88_bg.json'), encoding="utf-8") as inf: + with open( + os.path.join( + TESTCONTENT_DIR, "exercises", "perseus_question_x43bbec76d5f14f88_bg.json" + ), + encoding="utf-8", + ) as inf: item_data_bg = json.load(inf) datum = { - 'item': item_data_bg, - 'image_hashes': ['ea2269bb5cf487f8d883144b9c06fbc7', 'db98ca9d35b2fb97cde378a1fabddd26'] + "item": item_data_bg, + "image_hashes": [ + "ea2269bb5cf487f8d883144b9c06fbc7", + "db98ca9d35b2fb97cde378a1fabddd26", + ], } test_data.append(datum) @@ -618,38 +724,51 @@ def test_perseus_process_question(persues_question_json_fixtures): for datum in persues_question_json_fixtures: # setup - 
perseus_question = datum['item'] - expected_image_hashes = set(datum['image_hashes']) + perseus_question = datum["item"] + expected_image_hashes = set(datum["image_hashes"]) _clear_ricecookerfilecache() # clear file cache each time to avoid test interactions # SIT - testq = PerseusQuestion(id='x43bbec76d5f14f88_en', raw_data=perseus_question) + testq = PerseusQuestion(id="x43bbec76d5f14f88_en", raw_data=perseus_question) filenames = testq.process_question() # check 1 - assert len(filenames) == 2, 'wrong number of filenames found' + assert len(filenames) == 2, "wrong number of filenames found" # check 2 image_hashes = set() for filename in filenames: - filehash, ext = os.path.splitext(filename) + filehash, ext = os.path.splitext(filename) image_hashes.add(filehash) - assert image_hashes == expected_image_hashes, 'Unexpected image file set' + assert image_hashes == expected_image_hashes, "Unexpected image file set" # Test exercise images ################################################################################ + def test_exercise_image_file(exercise_image_file, exercise_image_filename): filename = exercise_image_file.get_filename() - assert filename == exercise_image_filename, 'wrong filename for _ExerciseImageFile' + assert filename == exercise_image_filename, "wrong filename for _ExerciseImageFile" + -def test_exercise_base64_image_file(exercise_base64_image_file, exercise_base64_image_filename): +def test_exercise_base64_image_file( + exercise_base64_image_file, exercise_base64_image_filename +): filename = exercise_base64_image_file.get_filename() - assert filename == exercise_base64_image_filename, 'wrong filename for _ExerciseBase64ImageFile' + assert ( + filename == exercise_base64_image_filename + ), "wrong filename for _ExerciseBase64ImageFile" + -def test_exercise_graphie_filename(exercise_graphie_file, exercise_graphie_replacement_str, exercise_graphie_filename): +def test_exercise_graphie_filename( + exercise_graphie_file, exercise_graphie_replacement_str, exercise_graphie_filename +): filename = exercise_graphie_file.get_filename() - assert filename == exercise_graphie_filename, 'wrong filename for _ExerciseGraphieFile' + assert ( + filename == exercise_graphie_filename + ), "wrong filename for _ExerciseGraphieFile" replacement_str = exercise_graphie_file.get_replacement_str() - assert replacement_str == exercise_graphie_replacement_str, 'wrong replacement string for _ExerciseGraphieFile ' + assert ( + replacement_str == exercise_graphie_replacement_str + ), "wrong replacement string for _ExerciseGraphieFile " diff --git a/tests/test_files.py b/tests/test_files.py index 3b93617a..fb0a148d 100644 --- a/tests/test_files.py +++ b/tests/test_files.py @@ -1,23 +1,31 @@ """ Tests for file downloading and processing """ import os.path -import pytest -from shutil import copyfile import tempfile +from shutil import copyfile +import pytest from le_utils.constants import languages -from ricecooker.classes.files import YouTubeVideoFile -from ricecooker.classes.files import YouTubeSubtitleFile -from ricecooker.classes.files import SubtitleFile -from ricecooker.classes.files import is_youtube_subtitle_file_supported_language +from test_pdfutils import _save_file_url_to_path + +from ricecooker import config from ricecooker.classes.files import _get_language_with_alpha2_fallback +from ricecooker.classes.files import is_youtube_subtitle_file_supported_language +from ricecooker.classes.files import SubtitleFile +from ricecooker.classes.files import YouTubeSubtitleFile +from 
ricecooker.classes.files import YouTubeVideoFile from ricecooker.utils.zip import create_predictable_zip -from ricecooker import config - -from test_pdfutils import _save_file_url_to_path # Process all of the files -def process_files(video_file, html_file, audio_file, document_file, epub_file, thumbnail_file, subtitle_file): +def process_files( + video_file, + html_file, + audio_file, + document_file, + epub_file, + thumbnail_file, + subtitle_file, +): video_file.process_file() html_file.process_file() audio_file.process_file() @@ -28,26 +36,96 @@ def process_files(video_file, html_file, audio_file, document_file, epub_file, t """ *********** DOWNLOAD TESTS *********** """ -def test_download(video_file, html_file, audio_file, document_file, epub_file, thumbnail_file, subtitle_file): + + +def test_download( + video_file, + html_file, + audio_file, + document_file, + epub_file, + thumbnail_file, + subtitle_file, +): try: - process_files(video_file, html_file, audio_file, document_file, epub_file, thumbnail_file, subtitle_file) + process_files( + video_file, + html_file, + audio_file, + document_file, + epub_file, + thumbnail_file, + subtitle_file, + ) assert True except Exception: assert False, "One or more of the files failed to download" -def test_download_filenames(video_file, video_filename, html_file, html_filename, audio_file, audio_filename, - document_file, document_filename, epub_file, epub_filename, thumbnail_file, thumbnail_filename, subtitle_file, subtitle_filename): - assert video_file.process_file() == video_filename, "Video file should have filename {}".format(video_filename) - assert html_file.process_file() == html_filename, "HTML file should have filename {}".format(html_filename) - assert audio_file.process_file() == audio_filename, "Audio file should have filename {}".format(audio_filename) - assert document_file.process_file() == document_filename, "PDF document file should have filename {}".format(document_filename) - assert epub_file.process_file() == epub_filename, "ePub document file should have filename {}".format(epub_filename) - assert thumbnail_file.process_file() == thumbnail_filename, "Thumbnail file should have filename {}".format(thumbnail_filename) - assert subtitle_file.process_file() == subtitle_filename, "Subtitle file should have filename {}".format(subtitle_filename) - -def test_download_to_storage(video_file, video_filename, html_file, html_filename, audio_file, audio_filename, - document_file, document_filename, epub_file, epub_filename, thumbnail_file, thumbnail_filename, subtitle_file, subtitle_filename): - process_files(video_file, html_file, audio_file, document_file, epub_file, thumbnail_file, subtitle_file) + +def test_download_filenames( + video_file, + video_filename, + html_file, + html_filename, + audio_file, + audio_filename, + document_file, + document_filename, + epub_file, + epub_filename, + thumbnail_file, + thumbnail_filename, + subtitle_file, + subtitle_filename, +): + assert ( + video_file.process_file() == video_filename + ), "Video file should have filename {}".format(video_filename) + assert ( + html_file.process_file() == html_filename + ), "HTML file should have filename {}".format(html_filename) + assert ( + audio_file.process_file() == audio_filename + ), "Audio file should have filename {}".format(audio_filename) + assert ( + document_file.process_file() == document_filename + ), "PDF document file should have filename {}".format(document_filename) + assert ( + epub_file.process_file() == epub_filename + ), "ePub document 
file should have filename {}".format(epub_filename) + assert ( + thumbnail_file.process_file() == thumbnail_filename + ), "Thumbnail file should have filename {}".format(thumbnail_filename) + assert ( + subtitle_file.process_file() == subtitle_filename + ), "Subtitle file should have filename {}".format(subtitle_filename) + + +def test_download_to_storage( + video_file, + video_filename, + html_file, + html_filename, + audio_file, + audio_filename, + document_file, + document_filename, + epub_file, + epub_filename, + thumbnail_file, + thumbnail_filename, + subtitle_file, + subtitle_filename, +): + process_files( + video_file, + html_file, + audio_file, + document_file, + epub_file, + thumbnail_file, + subtitle_file, + ) video_path = config.get_storage_path(video_filename) html_path = config.get_storage_path(html_filename) audio_path = config.get_storage_path(audio_filename) @@ -59,64 +137,95 @@ def test_download_to_storage(video_file, video_filename, html_file, html_filenam assert os.path.isfile(video_path), "Video should be stored at {}".format(video_path) assert os.path.isfile(html_path), "HTML should be stored at {}".format(html_path) assert os.path.isfile(audio_path), "Audio should be stored at {}".format(audio_path) - assert os.path.isfile(document_path), "PDF document should be stored at {}".format(document_path) - assert os.path.isfile(epub_path), "ePub document should be stored at {}".format(epub_path) - assert os.path.isfile(thumbnail_path), "Thumbnail should be stored at {}".format(thumbnail_path) - assert os.path.isfile(subtitle_path), "Subtitle should be stored at {}".format(subtitle_path) + assert os.path.isfile(document_path), "PDF document should be stored at {}".format( + document_path + ) + assert os.path.isfile(epub_path), "ePub document should be stored at {}".format( + epub_path + ) + assert os.path.isfile(thumbnail_path), "Thumbnail should be stored at {}".format( + thumbnail_path + ) + assert os.path.isfile(subtitle_path), "Subtitle should be stored at {}".format( + subtitle_path + ) + def test_set_language(): - sub1 = SubtitleFile('path', language='en') - sub2 = SubtitleFile('path', language=languages.getlang('es')) - assert isinstance(sub1.language, str), "Subtitles must be converted to Language class" + sub1 = SubtitleFile("path", language="en") + sub2 = SubtitleFile("path", language=languages.getlang("es")) + assert isinstance( + sub1.language, str + ), "Subtitles must be converted to Language class" assert isinstance(sub2.language, str), "Subtitles can be passed as Langauge models" - assert sub1.language == 'en', "Subtitles must have a language" - assert sub2.language == 'es', "Subtitles must have a language" - pytest.raises(TypeError, SubtitleFile, 'path', language='notalanguage') + assert sub1.language == "en", "Subtitles must have a language" + assert sub2.language == "es", "Subtitles must have a language" + pytest.raises(TypeError, SubtitleFile, "path", language="notalanguage") + def test_presets(): assert True + def test_validate(): assert True + def test_to_dict(): assert True + """ *********** DOWNLOADFILE TESTS *********** """ + + def test_downloadfile_validate(): assert True + def test_downloadfile_process_file(): assert True """ *********** THUMBNAILFILE TESTS *********** """ + + def test_thumbnailfile_validate(): assert True + def test_thumbnailfile_to_dict(): assert True + def test_languages(): assert True """ *********** DOCUMENTFILE TESTS *********** """ + + def test_documentfile_validate(): assert True + def test_documentfile_to_dict(): assert True 
""" *********** HTMLZIPFILE TESTS *********** """ + + def test_htmlfile_validate(): assert True + def test_htmlfile_to_dict(): assert True -@pytest.mark.skip('Skipping one-off create_predictable_zip stress test because long running...') + +@pytest.mark.skip( + "Skipping one-off create_predictable_zip stress test because long running..." +) def test_create_many_predictable_zip_files(ndirs=8193): """ Regression test for `OSError: [Errno 24] Too many open files` when using @@ -128,186 +237,213 @@ def test_create_many_predictable_zip_files(ndirs=8193): zip_paths = [] for _ in range(0, ndirs): inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir,'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) zip_paths.append(zip_path) - assert len(zip_paths) == ndirs, 'wrong number of zip files created' + assert len(zip_paths) == ndirs, "wrong number of zip files created" """ *********** EXTRACTEDVIDEOTHUMBNAILFILE TESTS *********** """ + + def test_extractedvideothumbnail_process_file(): assert True + def test_extractedvideothumbnail_validate(): assert True + def test_extractedvideothumbnail_to_dict(): assert True + def test_extractedvideothumbnail_derive_thumbnail(): assert True + """ *********** VIDEOFILE TESTS *********** """ + + def test_video_validate(): assert True + def test_video_to_dict(): assert True """ *********** WEBVIDEOFILE TESTS *********** """ + + def test_webvideo_process_file(): assert True + def test_webvideo_validate(): assert True + def test_webvideo_to_dict(): assert True """ *********** YOUTUBEVIDEOFILE TESTS *********** """ + @pytest.mark.skipif(True, reason="Requires connecting to youtube.") def test_youtubevideo_process_file(youtube_video_dict): - video_file = YouTubeVideoFile(youtube_id=youtube_video_dict['youtube_id']) + video_file = YouTubeVideoFile(youtube_id=youtube_video_dict["youtube_id"]) filename = video_file.process_file() - assert filename is not None, 'Processing YouTubeVideoFile file failed' - assert filename.endswith('.mp4'), 'Wrong extenstion for video' + assert filename is not None, "Processing YouTubeVideoFile file failed" + assert filename.endswith(".mp4"), "Wrong extenstion for video" + def test_youtubevideo_validate(): assert True + def test_youtubevideo_to_dict(): assert True - """ *********** YOUTUBESUBTITLEFILE TESTS *********** """ + @pytest.fixture def subtitles_langs_internal(): - return ['en', 'es', 'pt-BR'] + return ["en", "es", "pt-BR"] + @pytest.fixture def subtitles_langs_pycountry_mappable(): - return ['zu'] + return ["zu"] + @pytest.fixture def subtitles_langs_youtube_custom(): - return ['iw', 'zh-Hans', 'pt-BR'] + return ["iw", "zh-Hans", "pt-BR"] + @pytest.fixture def subtitles_langs_ubsupported(): - return ['sgn', 'zzzza', 'ab-dab', 'bbb-qqq'] + return ["sgn", "zzzza", "ab-dab", "bbb-qqq"] -def test_is_youtube_subtitle_file_supported_language(subtitles_langs_internal, - subtitles_langs_pycountry_mappable, - subtitles_langs_youtube_custom): + +def test_is_youtube_subtitle_file_supported_language( + subtitles_langs_internal, + subtitles_langs_pycountry_mappable, + subtitles_langs_youtube_custom, +): for lang in subtitles_langs_internal: - assert is_youtube_subtitle_file_supported_language(lang), 'should be supported' + assert is_youtube_subtitle_file_supported_language(lang), "should be supported" lang_obj = _get_language_with_alpha2_fallback(lang) - assert lang_obj is not 
None, 'lookup should return Language object'
+        assert lang_obj is not None, "lookup should return Language object"
     for lang in subtitles_langs_pycountry_mappable:
-        assert is_youtube_subtitle_file_supported_language(lang), 'should be supported'
+        assert is_youtube_subtitle_file_supported_language(lang), "should be supported"
         lang_obj = _get_language_with_alpha2_fallback(lang)
-        assert lang_obj is not None, 'lookup should return Language object'
+        assert lang_obj is not None, "lookup should return Language object"
     for lang in subtitles_langs_youtube_custom:
-        assert is_youtube_subtitle_file_supported_language(lang), 'should be supported'
+        assert is_youtube_subtitle_file_supported_language(lang), "should be supported"
         lang_obj = _get_language_with_alpha2_fallback(lang)
-        assert lang_obj is not None, 'lookup should return Language object'
+        assert lang_obj is not None, "lookup should return Language object"
+
 
 def test_is_youtube_subtitle_file_unsupported_language(subtitles_langs_ubsupported):
     for lang in subtitles_langs_ubsupported:
-        assert not is_youtube_subtitle_file_supported_language(lang), 'should not be supported'
+        assert not is_youtube_subtitle_file_supported_language(
+            lang
+        ), "should not be supported"
         lang_obj = _get_language_with_alpha2_fallback(lang)
-        assert lang_obj is None, 'lookup should fail'
+        assert lang_obj is None, "lookup should fail"
+
 
 @pytest.mark.skipif(True, reason="Requires connecting to youtube.")
 def test_youtubesubtitle_process_file(youtube_video_with_subs_dict):
-    youtube_id = youtube_video_with_subs_dict['youtube_id']
-    lang = youtube_video_with_subs_dict['subtitles_langs'][0]
+    youtube_id = youtube_video_with_subs_dict["youtube_id"]
+    lang = youtube_video_with_subs_dict["subtitles_langs"][0]
     sub_file = YouTubeSubtitleFile(youtube_id=youtube_id, language=lang)
     filename = sub_file.process_file()
-    assert filename is not None, 'Processing YouTubeSubtitleFile file failed'
-    assert filename.endswith('.vtt'), 'Wrong extenstion for video subtitles'
-    assert not filename.endswith('.' + lang + '.vtt'), 'Lang code in extension'
+    assert filename is not None, "Processing YouTubeSubtitleFile file failed"
+    assert filename.endswith(".vtt"), "Wrong extension for video subtitles"
+    assert not filename.endswith("." + lang + ".vtt"), "Lang code in extension"
+
 
 def test_youtubesubtitle_validate():
     assert True
 
+
 def test_youtubesubtitle_to_dict():
     assert True
 
-
-
 """ *********** SUBTITLEFILE TESTS *********** """
 
+
 def test_convertible_substitles_ar_srt():
     """
     Basic check that srt --> vtt conversion works.
""" local_path = os.path.join("tests", "testcontent", "samples", "testsubtitles_ar.srt") assert os.path.exists(local_path) - subtitle_file = SubtitleFile(local_path, language='ar') + subtitle_file = SubtitleFile(local_path, language="ar") filename = subtitle_file.process_file() - assert filename, 'converted filename must exist' - assert filename.endswith('.vtt'), 'converted filename must have .vtt extension' + assert filename, "converted filename must exist" + assert filename.endswith(".vtt"), "converted filename must have .vtt extension" storage_path = config.get_storage_path(filename) with open(storage_path, encoding="utf-8") as converted_vtt: filecontents = converted_vtt.read() - check_words = 'لناس على' - assert check_words in filecontents, 'missing check word in converted subs' + check_words = "لناس على" + assert check_words in filecontents, "missing check word in converted subs" @pytest.fixture def bad_subtitles_file(): local_path = os.path.join("tests", "testcontent", "generated", "unconvetible.sub") if not os.path.exists(local_path): - with open(local_path, 'wb') as f: - f.write(b'this is an invalid subtitle file that cant be converted.') + with open(local_path, "wb") as f: + f.write(b"this is an invalid subtitle file that cant be converted.") f.flush() else: - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute def test_bad_subtitles_raises(bad_subtitles_file): - subs_file = SubtitleFile(bad_subtitles_file.name, language='en') + subs_file = SubtitleFile(bad_subtitles_file.name, language="en") pytest.raises(ValueError, subs_file.process_file) - - PRESSURECOOKER_REPO_URL = "https://raw.githubusercontent.com/bjester/pressurecooker/" -PRESSURECOOKER_FILES_URL_BASE = PRESSURECOOKER_REPO_URL + "pycaption/tests/files/subtitles/" +PRESSURECOOKER_FILES_URL_BASE = ( + PRESSURECOOKER_REPO_URL + "pycaption/tests/files/subtitles/" +) PRESSURECOOKER_SUBS_FIXTURES = [ { - 'srcfilename': 'basic.srt', - 'subtitlesformat': 'srt', - 'language': languages.getlang('ar'), - 'check_words': 'البعض أكثر' + "srcfilename": "basic.srt", + "subtitlesformat": "srt", + "language": languages.getlang("ar"), + "check_words": "البعض أكثر", }, { - 'srcfilename': 'encapsulated.sami', - 'subtitlesformat': 'sami', - 'language': 'en', - 'check_words': 'we have this vision of Einstein', + "srcfilename": "encapsulated.sami", + "subtitlesformat": "sami", + "language": "en", + "check_words": "we have this vision of Einstein", }, { - 'srcfilename': 'basic.vtt', - 'subtitlesformat': 'vtt', - 'language': 'ar', - 'check_words': 'البعض أكثر' + "srcfilename": "basic.vtt", + "subtitlesformat": "vtt", + "language": "ar", + "check_words": "البعض أكثر", }, { - 'srcfilename': 'encapsulated.vtt', - 'subtitlesformat': 'vtt', - 'language': 'en', - 'check_words': 'we have this vision of Einstein' + "srcfilename": "encapsulated.vtt", + "subtitlesformat": "vtt", + "language": "en", + "check_words": "we have this vision of Einstein", }, ] @@ -318,15 +454,20 @@ def download_fixture_files(fixtures_list): """ fixtures = [] for fixture in fixtures_list: - srcfilename = fixture['srcfilename'] - localpath = os.path.join('tests', 'testcontent', 'downloaded', srcfilename) + srcfilename = fixture["srcfilename"] + localpath = os.path.join("tests", "testcontent", "downloaded", srcfilename) if not os.path.exists(localpath): - url = fixture['url'] if 'url' in fixture.keys() \ + url = ( + fixture["url"] + if "url" in fixture.keys() else PRESSURECOOKER_FILES_URL_BASE 
+ srcfilename + ) print(url) _save_file_url_to_path(url, localpath) - assert os.path.exists(localpath), 'Error mising local test file ' + localpath - fixture['localpath'] = localpath + assert os.path.exists(localpath), ( + "Error mising local test file " + localpath + ) + fixture["localpath"] = localpath fixtures.append(fixture) return fixtures @@ -338,17 +479,20 @@ def pressurcooker_test_files(): """ return download_fixture_files(PRESSURECOOKER_SUBS_FIXTURES) + @pytest.fixture def youtube_test_file(): - return download_fixture_files([ - { - 'srcfilename': 'testsubtitles_ar.ttml', - 'subtitlesformat': 'ttml', - 'language': 'ar', - 'check_words': 'Mohammed Liyaudheen wafy', - 'url': 'https://www.youtube.com/api/timedtext?lang=ar&v=C_9f7Qq4YZc&fmt=ttml&name=' - }, - ]) + return download_fixture_files( + [ + { + "srcfilename": "testsubtitles_ar.ttml", + "subtitlesformat": "ttml", + "language": "ar", + "check_words": "Mohammed Liyaudheen wafy", + "url": "https://www.youtube.com/api/timedtext?lang=ar&v=C_9f7Qq4YZc&fmt=ttml&name=", + }, + ] + ) def test_convertible_substitles_from_pressurcooker(pressurcooker_test_files): @@ -357,47 +501,53 @@ def test_convertible_substitles_from_pressurcooker(pressurcooker_test_files): All subs have the appropriate extension so no need to specify `subtitlesformat`. """ for fixture in pressurcooker_test_files: - localpath = fixture['localpath'] - assert os.path.exists(localpath), 'Error mising local test file ' + localpath - subtitle_file = SubtitleFile(localpath, language=fixture['language']) + localpath = fixture["localpath"] + assert os.path.exists(localpath), "Error mising local test file " + localpath + subtitle_file = SubtitleFile(localpath, language=fixture["language"]) filename = subtitle_file.process_file() - assert filename, 'conferted filename must exit' - assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension' + assert filename, "conferted filename must exit" + assert filename.endswith(".vtt"), "conferted filename must have .vtt extension" storage_path = config.get_storage_path(filename) with open(storage_path, encoding="utf-8") as converted_vtt: filecontents = converted_vtt.read() - assert fixture['check_words'] in filecontents, 'missing check_words in converted subs' + assert ( + fixture["check_words"] in filecontents + ), "missing check_words in converted subs" def test_convertible_substitles_ar_ttml(youtube_test_file): """ Regression test to make sure correct lang_code is detected from .ttml data. """ - local_path = os.path.join("tests", "testcontent", "downloaded", "testsubtitles_ar.ttml") + local_path = os.path.join( + "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml" + ) assert os.path.exists(local_path) - subtitle_file = SubtitleFile(local_path, language='ar') + subtitle_file = SubtitleFile(local_path, language="ar") filename = subtitle_file.process_file() - assert filename, 'conferted filename must exit' - assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension' + assert filename, "conferted filename must exit" + assert filename.endswith(".vtt"), "conferted filename must have .vtt extension" def test_convertible_substitles_noext_subtitlesformat(): """ Check that we handle correctly cases when path doesn't contain extenstion. 
""" - local_path = os.path.join("tests", "testcontent", "downloaded", "testsubtitles_ar.ttml") + local_path = os.path.join( + "tests", "testcontent", "downloaded", "testsubtitles_ar.ttml" + ) assert os.path.exists(local_path) - local_path_no_ext = local_path.replace('.ttml', '') + local_path_no_ext = local_path.replace(".ttml", "") copyfile(local_path, local_path_no_ext) assert os.path.exists(local_path_no_ext) subtitle_file = SubtitleFile( local_path_no_ext, - language='ar', - subtitlesformat='ttml' # settting subtitlesformat becaue no ext + language="ar", + subtitlesformat="ttml", # settting subtitlesformat becaue no ext ) filename = subtitle_file.process_file() - assert filename, 'conferted filename must exit' - assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension' + assert filename, "conferted filename must exit" + assert filename.endswith(".vtt"), "conferted filename must have .vtt extension" def test_convertible_substitles_weirdext_subtitlesformat(): @@ -405,18 +555,21 @@ def test_convertible_substitles_weirdext_subtitlesformat(): Check that we handle cases when ext cannot be guessed from URL or localpath. Passing `subtitlesformat` allows chef authors to manually specify subs format. """ - subs_url = 'https://commons.wikimedia.org/w/api.php?' \ - + 'action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang=es&trackformat=srt' + subs_url = ( + "https://commons.wikimedia.org/w/api.php?" + + "action=timedtext&title=File%3AA_Is_for_Atom_1953.webm&lang=es&trackformat=srt" + ) subtitle_file = SubtitleFile( subs_url, - language='es', - subtitlesformat='srt' # set subtitlesformat when can't inferr ext form url + language="es", + subtitlesformat="srt", # set subtitlesformat when can't inferr ext form url ) filename = subtitle_file.process_file() - assert filename, 'conferted filename must exit' - assert filename.endswith('.vtt'), 'conferted filename must have .vtt extension' + assert filename, "conferted filename must exit" + assert filename.endswith(".vtt"), "conferted filename must have .vtt extension" storage_path = config.get_storage_path(filename) with open(storage_path, encoding="utf-8") as converted_vtt: filecontents = converted_vtt.read() - assert 'El total de los protones y neutrones de un átomo' in filecontents, \ - 'missing check words in converted subs' + assert ( + "El total de los protones y neutrones de un átomo" in filecontents + ), "missing check words in converted subs" diff --git a/tests/test_licenses.py b/tests/test_licenses.py index f7f487d8..9fb4134c 100644 --- a/tests/test_licenses.py +++ b/tests/test_licenses.py @@ -1,61 +1,74 @@ """ Tests for license getting and serialization """ - import json + import pytest +from le_utils.constants.licenses import ALL_RIGHTS_RESERVED +from le_utils.constants.licenses import CC_BY +from le_utils.constants.licenses import CC_BY_NC +from le_utils.constants.licenses import CC_BY_NC_ND +from le_utils.constants.licenses import CC_BY_NC_SA +from le_utils.constants.licenses import CC_BY_ND +from le_utils.constants.licenses import CC_BY_SA +from le_utils.constants.licenses import PUBLIC_DOMAIN +from le_utils.constants.licenses import SPECIAL_PERMISSIONS -from le_utils.constants.licenses import ( - CC_BY, CC_BY_SA, CC_BY_ND, CC_BY_NC, CC_BY_NC_SA, CC_BY_NC_ND, - ALL_RIGHTS_RESERVED, - PUBLIC_DOMAIN, - SPECIAL_PERMISSIONS -) from ricecooker.classes.licenses import get_license - """ *********** LICENSE FIXTURES *********** """ + + @pytest.fixture def license_objects(): - regular_ids = [CC_BY, CC_BY_SA, CC_BY_ND, 
CC_BY_NC, CC_BY_NC_SA, CC_BY_NC_ND, - ALL_RIGHTS_RESERVED, PUBLIC_DOMAIN] + regular_ids = [ + CC_BY, + CC_BY_SA, + CC_BY_ND, + CC_BY_NC, + CC_BY_NC_SA, + CC_BY_NC_ND, + ALL_RIGHTS_RESERVED, + PUBLIC_DOMAIN, + ] license_objects = [] for regular_id in regular_ids: # with desciption and copyright_holder - licence_obj = get_license(regular_id, - copyright_holder='Some name', - description='Le description') - assert licence_obj, 'licence_obj should exist' + licence_obj = get_license( + regular_id, copyright_holder="Some name", description="Le description" + ) + assert licence_obj, "licence_obj should exist" license_objects.append(licence_obj) # with desciption only - licence_obj = get_license(regular_id, description='Le description solo2') - assert licence_obj, 'licence_obj should exist' + licence_obj = get_license(regular_id, description="Le description solo2") + assert licence_obj, "licence_obj should exist" license_objects.append(licence_obj) # with copyright_holder only - licence_obj = get_license(regular_id, copyright_holder='Some name3') - assert licence_obj, 'licence_obj should exist' + licence_obj = get_license(regular_id, copyright_holder="Some name3") + assert licence_obj, "licence_obj should exist" license_objects.append(licence_obj) # bare licence_obj = get_license(regular_id) - assert licence_obj, 'licence_obj should exist' + assert licence_obj, "licence_obj should exist" license_objects.append(licence_obj) return license_objects + @pytest.fixture def special_license(): - return get_license(SPECIAL_PERMISSIONS, - copyright_holder='Authorov', - description='Only for use offline') - - - + return get_license( + SPECIAL_PERMISSIONS, + copyright_holder="Authorov", + description="Only for use offline", + ) """ *********** LICENSE TESTS *********** """ + def test_the_license_fixtures(license_objects, special_license): assert len(license_objects) > 4 assert special_license.license_id == SPECIAL_PERMISSIONS @@ -65,9 +78,9 @@ def test_the_license_fixtures(license_objects, special_license): def test_bad_special_license(): try: get_license(SPECIAL_PERMISSIONS, description=None) - assert False, 'Should not come here because of missing description' + assert False, "Should not come here because of missing description" except AssertionError: - assert True, 'SPECIAL_PERMISSIONS without description should raise an exception' + assert True, "SPECIAL_PERMISSIONS without description should raise an exception" def _compare_licence_objects(obj1, obj2): @@ -93,5 +106,4 @@ def test_license_serilizibility(license_objects, special_license): license_copy = get_license(**license_copy_dict) same_attributes = _compare_licence_objects(licence_orig, license_copy) - assert same_attributes, 'License attributes not the same after serizlize' - + assert same_attributes, "License attributes not the same after serizlize" diff --git a/tests/test_links.py b/tests/test_links.py index 62f00455..cfc7df38 100644 --- a/tests/test_links.py +++ b/tests/test_links.py @@ -13,8 +13,8 @@ def test_replace_absolute_links(): img_srcset_content = '' urls_to_replace = { - 'http://replace.me/img/hello.jpg': 'img/hello.jpg', - 'http://replace.me/link/to/page.html': 'link/to/page.html' + "http://replace.me/img/hello.jpg": "img/hello.jpg", + "http://replace.me/link/to/page.html": "link/to/page.html", } output = replace_links(img_content, urls_to_replace) @@ -43,23 +43,53 @@ def test_replace_relative_links(): img_srcset_content = '' urls_to_replace = { - 'http://replace.me/img/hello.jpg': 'replace.me/img/hello.jpg', - 
'http://replace.me/link/to/page.html': 'replace.me/link/to/page.html' + "http://replace.me/img/hello.jpg": "replace.me/img/hello.jpg", + "http://replace.me/link/to/page.html": "replace.me/link/to/page.html", } - content_dir = os.path.join('replace.me', 'link', 'from') - download_root = '.' - - output = replace_links(img_content, urls_to_replace, download_root=download_root, content_dir=content_dir, relative_links=True) + content_dir = os.path.join("replace.me", "link", "from") + download_root = "." + + output = replace_links( + img_content, + urls_to_replace, + download_root=download_root, + content_dir=content_dir, + relative_links=True, + ) assert output == '' - output = replace_links(a_content, urls_to_replace, download_root=download_root, content_dir=content_dir, relative_links=True) + output = replace_links( + a_content, + urls_to_replace, + download_root=download_root, + content_dir=content_dir, + relative_links=True, + ) assert output == '' - output = replace_links(noscheme_a_content, urls_to_replace, download_root=download_root, content_dir=content_dir, relative_links=True) + output = replace_links( + noscheme_a_content, + urls_to_replace, + download_root=download_root, + content_dir=content_dir, + relative_links=True, + ) assert output == '' - output = replace_links(root_a_content, urls_to_replace, download_root=download_root, content_dir=content_dir, relative_links=True) + output = replace_links( + root_a_content, + urls_to_replace, + download_root=download_root, + content_dir=content_dir, + relative_links=True, + ) assert output == '' - output = replace_links(img_srcset_content, urls_to_replace, download_root=download_root, content_dir=content_dir, relative_links=True) + output = replace_links( + img_srcset_content, + urls_to_replace, + download_root=download_root, + content_dir=content_dir, + relative_links=True, + ) assert output == '' diff --git a/tests/test_pdfutils.py b/tests/test_pdfutils.py index ca048714..f79c5ecd 100644 --- a/tests/test_pdfutils.py +++ b/tests/test_pdfutils.py @@ -1,51 +1,65 @@ import copy import os -from pprint import pprint -import pytest import re -import requests +from pprint import pprint from tempfile import TemporaryDirectory +import pytest +import requests from PyPDF2 import PdfFileReader -from ricecooker.classes import nodes, files - -from ricecooker.utils.pdf import PDFParser # SIT +from ricecooker.classes import files +from ricecooker.classes import nodes +from ricecooker.utils.pdf import PDFParser # SIT # Fixtures ################################################################################ + @pytest.fixture def downloads_dir(): with TemporaryDirectory() as temp_dir: yield temp_dir + @pytest.fixture def doc1_with_toc_path(): - doc1_with_toc_path = os.path.join('tests', 'testcontent', 'samples', 'sample_doc_with_toc.pdf') - assert os.path.exists(doc1_with_toc_path), 'Error mising test file ' + doc1_with_toc_path + doc1_with_toc_path = os.path.join( + "tests", "testcontent", "samples", "sample_doc_with_toc.pdf" + ) + assert os.path.exists(doc1_with_toc_path), ( + "Error mising test file " + doc1_with_toc_path + ) return doc1_with_toc_path + def _save_file_url_to_path(url, path): if not os.path.exists(path): - with open(path, 'wb') as f: + with open(path, "wb") as f: resp = requests.get(url, stream=True) for chunk in resp.iter_content(chunk_size=1048576): f.write(chunk) f.flush() - assert os.path.exists(path), 'Error mising test file ' + path + assert os.path.exists(path), "Error mising test file " + path + @pytest.fixture def 
doc2_with_toc_path(): """ A PDF with lots of chapters. """ - doc2_with_toc_path = os.path.join('tests', 'testcontent', 'downloaded', 'Beyond-Good-and-Evil-Galbraithcolor.pdf') - _save_file_url_to_path('https://s3-us-west-2.amazonaws.com/pressbooks-samplefiles/' - 'GalbraithColorTheme/Beyond-Good-and-Evil-Galbraithcolor.pdf', - doc2_with_toc_path) - assert os.path.exists(doc2_with_toc_path), 'Error mising test file ' + doc2_with_toc_path + doc2_with_toc_path = os.path.join( + "tests", "testcontent", "downloaded", "Beyond-Good-and-Evil-Galbraithcolor.pdf" + ) + _save_file_url_to_path( + "https://s3-us-west-2.amazonaws.com/pressbooks-samplefiles/" + "GalbraithColorTheme/Beyond-Good-and-Evil-Galbraithcolor.pdf", + doc2_with_toc_path, + ) + assert os.path.exists(doc2_with_toc_path), ( + "Error mising test file " + doc2_with_toc_path + ) return doc2_with_toc_path @@ -54,41 +68,49 @@ def doc3_with_toc_path(): """ A Gutenberg textbook PDF with a chapter-subchapter structure. """ - doc3_with_toc_path = os.path.join('tests', 'testcontent', 'downloaded', '41568-pdf.pdf') - _save_file_url_to_path('https://www.gutenberg.org/files/41568/41568-pdf.pdf', - doc3_with_toc_path) - assert os.path.exists(doc3_with_toc_path), 'Error mising test file ' + doc3_with_toc_path + doc3_with_toc_path = os.path.join( + "tests", "testcontent", "downloaded", "41568-pdf.pdf" + ) + _save_file_url_to_path( + "https://www.gutenberg.org/files/41568/41568-pdf.pdf", doc3_with_toc_path + ) + assert os.path.exists(doc3_with_toc_path), ( + "Error mising test file " + doc3_with_toc_path + ) return doc3_with_toc_path # Chapters only ################################################################################ + def test_get_toc(doc1_with_toc_path, downloads_dir): - with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser: + with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser: chapters_toc = pdfparser.get_toc() for chapter_dict in chapters_toc: _check_pagerange_matches_title_len(chapter_dict) + def test_split_chapters(doc1_with_toc_path, downloads_dir): with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser: chapters = pdfparser.split_chapters() # pprint(chapters) for chapter in chapters: - chapter_path = chapter['path'] - assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf' - assert os.path.exists(chapter_path), 'missing split PDF file' + chapter_path = chapter["path"] + assert chapter_path.endswith(".pdf"), "wrong extension -- expected .pdf" + assert os.path.exists(chapter_path), "missing split PDF file" _check_path_matches_title_len(chapter) + def test_split_chapters_alt(doc1_with_toc_path, downloads_dir): with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser: chapters_toc = pdfparser.get_toc() chapters = pdfparser.split_chapters(jsondata=chapters_toc) # pprint(chapters) for chapter in chapters: - chapter_path = chapter['path'] - assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf' - assert os.path.exists(chapter_path), 'missing split PDF file' + chapter_path = chapter["path"] + assert chapter_path.endswith(".pdf"), "wrong extension -- expected .pdf" + assert os.path.exists(chapter_path), "missing split PDF file" _check_path_matches_title_len(chapter) @@ -97,23 +119,41 @@ def test_split_chapters2(doc2_with_toc_path, downloads_dir): chapters = pdfparser.split_chapters() # pprint(chapters) for chapter in chapters: - chapter_path = chapter['path'] - assert chapter_path.endswith('.pdf'), 'wrong extension -- expected 
.pdf' - assert os.path.exists(chapter_path), 'missing split PDF file' + chapter_path = chapter["path"] + assert chapter_path.endswith(".pdf"), "wrong extension -- expected .pdf" + assert os.path.exists(chapter_path), "missing split PDF file" # - assert _get_pdf_len(chapters[0]) == 2, 'wrong length for ch ' + str(chapters[0]) - assert _get_pdf_len(chapters[1]) == 2, 'wrong length for ch ' + str(chapters[1]) - assert _get_pdf_len(chapters[2]) == 4, 'wrong length for ch ' + str(chapters[2]) - assert _get_pdf_len(chapters[3]) == 21, 'wrong length for ch ' + str(chapters[3]) - assert _get_pdf_len(chapters[4]) == 19, 'wrong length for ch ' + str(chapters[4]) - assert _get_pdf_len(chapters[5]) == 16, 'wrong length for ch ' + str(chapters[5]) - assert _get_pdf_len(chapters[6]) == 9, 'wrong length for ch ' + str(chapters[6]) - assert _get_pdf_len(chapters[7]) == 21, 'wrong length for ch ' + str(chapters[7]) - assert _get_pdf_len(chapters[8]) == 18, 'wrong length for ch ' + str(chapters[8]) - assert _get_pdf_len(chapters[9]) == 23, 'wrong length for ch ' + str(chapters[9]) - assert _get_pdf_len(chapters[10]) == 23, 'wrong length for ch ' + str(chapters[10]) - assert _get_pdf_len(chapters[11]) == 30, 'wrong length for ch ' + str(chapters[11]) - assert _get_pdf_len(chapters[12]) == 4, 'wrong length for ch ' + str(chapters[12]) + assert _get_pdf_len(chapters[0]) == 2, "wrong length for ch " + str(chapters[0]) + assert _get_pdf_len(chapters[1]) == 2, "wrong length for ch " + str(chapters[1]) + assert _get_pdf_len(chapters[2]) == 4, "wrong length for ch " + str(chapters[2]) + assert _get_pdf_len(chapters[3]) == 21, "wrong length for ch " + str( + chapters[3] + ) + assert _get_pdf_len(chapters[4]) == 19, "wrong length for ch " + str( + chapters[4] + ) + assert _get_pdf_len(chapters[5]) == 16, "wrong length for ch " + str( + chapters[5] + ) + assert _get_pdf_len(chapters[6]) == 9, "wrong length for ch " + str(chapters[6]) + assert _get_pdf_len(chapters[7]) == 21, "wrong length for ch " + str( + chapters[7] + ) + assert _get_pdf_len(chapters[8]) == 18, "wrong length for ch " + str( + chapters[8] + ) + assert _get_pdf_len(chapters[9]) == 23, "wrong length for ch " + str( + chapters[9] + ) + assert _get_pdf_len(chapters[10]) == 23, "wrong length for ch " + str( + chapters[10] + ) + assert _get_pdf_len(chapters[11]) == 30, "wrong length for ch " + str( + chapters[11] + ) + assert _get_pdf_len(chapters[12]) == 4, "wrong length for ch " + str( + chapters[12] + ) def test_split_chapters3(doc3_with_toc_path, downloads_dir): @@ -122,27 +162,40 @@ def test_split_chapters3(doc3_with_toc_path, downloads_dir): chapters = pdfparser.split_chapters() # pprint(chapters) for chapter in chapters: - chapter_path = chapter['path'] - assert chapter_path.endswith('.pdf'), 'wrong extension -- expected .pdf' - assert os.path.exists(chapter_path), 'missing split PDF file' - assert _get_pdf_len(chapters[0]) == 1, 'wrong length for ch ' + str(chapters[0]) - assert _get_pdf_len(chapters[1]) == 1, 'wrong length for ch ' + str(chapters[1]) - assert _get_pdf_len(chapters[2]) == 2, 'wrong length for ch ' + str(chapters[2]) - assert _get_pdf_len(chapters[3]) == 206, 'wrong length for ch ' + str(chapters[3]) - assert _get_pdf_len(chapters[4]) == 9, 'wrong length for ch ' + str(chapters[4]) - assert _get_pdf_len(chapters[5]) == 9, 'wrong length for ch ' + str(chapters[5]) + chapter_path = chapter["path"] + assert chapter_path.endswith(".pdf"), "wrong extension -- expected .pdf" + assert os.path.exists(chapter_path), "missing split PDF file" 
+ assert _get_pdf_len(chapters[0]) == 1, "wrong length for ch " + str( + chapters[0] + ) + assert _get_pdf_len(chapters[1]) == 1, "wrong length for ch " + str( + chapters[1] + ) + assert _get_pdf_len(chapters[2]) == 2, "wrong length for ch " + str( + chapters[2] + ) + assert _get_pdf_len(chapters[3]) == 206, "wrong length for ch " + str( + chapters[3] + ) + assert _get_pdf_len(chapters[4]) == 9, "wrong length for ch " + str( + chapters[4] + ) + assert _get_pdf_len(chapters[5]) == 9, "wrong length for ch " + str( + chapters[5] + ) # print('assert _get_pdf_len(chapters[]) ==', str(_get_pdf_len(chapter))+', \'wrong length for ch \' + str(chapters[])') # Chapters and subchapters ################################################################################ + def test_get_toc_subchapters(doc1_with_toc_path, downloads_dir): with PDFParser(doc1_with_toc_path, directory=downloads_dir) as pdfparser: chapters_toc = pdfparser.get_toc(subchapters=True) for chapter_dict in chapters_toc: - if 'children' in chapter_dict and chapter_dict['children']: - for subchapter_dict in chapter_dict['children']: + if "children" in chapter_dict and chapter_dict["children"]: + for subchapter_dict in chapter_dict["children"]: _check_pagerange_matches_title_len(subchapter_dict) else: _check_pagerange_matches_title_len(chapter_dict) @@ -153,82 +206,107 @@ def test_split_subchapters(doc1_with_toc_path, downloads_dir): chapters = pdfparser.split_subchapters() # pprint(chapters) for ch in chapters[0:4]: - assert 'children' not in ch, 'first four chapters have no subchapters...' - assert _get_pdf_len(chapters[0]) == 1, 'wrong num pages in ' + str(chapters[0]) - assert _get_pdf_len(chapters[1]) == 1, 'wrong num pages in ' + str(chapters[1]) - assert _get_pdf_len(chapters[2]) == 2, 'wrong num pages in ' + str(chapters[2]) - assert _get_pdf_len(chapters[3]) == 3, 'wrong num pages in ' + str(chapters[3]) + assert "children" not in ch, "first four chapters have no subchapters..." 
+ assert _get_pdf_len(chapters[0]) == 1, "wrong num pages in " + str(chapters[0]) + assert _get_pdf_len(chapters[1]) == 1, "wrong num pages in " + str(chapters[1]) + assert _get_pdf_len(chapters[2]) == 2, "wrong num pages in " + str(chapters[2]) + assert _get_pdf_len(chapters[3]) == 3, "wrong num pages in " + str(chapters[3]) ch4 = chapters[4] - assert 'children' in ch4, 'no children' - assert len(ch4['children']) == 2 - assert _get_pdf_len(ch4['children'][0]) == 1, 'wrong num pages in ' + str(ch4['children'][0]) - assert _get_pdf_len(ch4['children'][1]) == 1, 'wrong num pages in ' + str(ch4['children'][1]) + assert "children" in ch4, "no children" + assert len(ch4["children"]) == 2 + assert _get_pdf_len(ch4["children"][0]) == 1, "wrong num pages in " + str( + ch4["children"][0] + ) + assert _get_pdf_len(ch4["children"][1]) == 1, "wrong num pages in " + str( + ch4["children"][1] + ) ch5 = chapters[5] - assert 'children' in ch5, 'no children' - assert len(ch5['children']) == 3 - assert _get_pdf_len(ch5['children'][0]) == 1, 'wrong num pages in ' + str(ch5['children'][0]) - assert _get_pdf_len(ch5['children'][1]) == 1, 'wrong num pages in ' + str(ch5['children'][1]) - assert _get_pdf_len(ch5['children'][2]) == 1, 'wrong num pages in ' + str(ch5['children'][2]) - - + assert "children" in ch5, "no children" + assert len(ch5["children"]) == 3 + assert _get_pdf_len(ch5["children"][0]) == 1, "wrong num pages in " + str( + ch5["children"][0] + ) + assert _get_pdf_len(ch5["children"][1]) == 1, "wrong num pages in " + str( + ch5["children"][1] + ) + assert _get_pdf_len(ch5["children"][2]) == 1, "wrong num pages in " + str( + ch5["children"][2] + ) def test_split_subchapters3(doc3_with_toc_path, downloads_dir): with PDFParser(doc3_with_toc_path, directory=downloads_dir) as pdfparser: chapters = pdfparser.split_subchapters() ch3 = chapters[3] - assert 'children' in ch3, 'no subchapters found in ch3' - assert len(ch3['children']) == 17, 'wrong number of subchapters' - subchs = ch3['children'] - assert _get_pdf_len(subchs[0]) == 6, 'wrong length for subch ' + str(subchs[0]) - assert _get_pdf_len(subchs[1]) == 8, 'wrong length for subch ' + str(subchs[1]) - assert _get_pdf_len(subchs[2]) == 14, 'wrong length for subch ' + str(subchs[2]) - assert _get_pdf_len(subchs[3]) == 14, 'wrong length for subch ' + str(subchs[3]) - assert _get_pdf_len(subchs[4]) == 11, 'wrong length for subch ' + str(subchs[4]) - assert _get_pdf_len(subchs[5]) == 13, 'wrong length for subch ' + str(subchs[5]) - assert _get_pdf_len(subchs[6]) == 13, 'wrong length for subch ' + str(subchs[6]) - assert _get_pdf_len(subchs[7]) == 10, 'wrong length for subch ' + str(subchs[7]) - assert _get_pdf_len(subchs[8]) == 13, 'wrong length for subch ' + str(subchs[8]) - assert _get_pdf_len(subchs[9]) == 15, 'wrong length for subch ' + str(subchs[9]) - assert _get_pdf_len(subchs[10]) == 16, 'wrong length for subch ' + str(subchs[10]) - assert _get_pdf_len(subchs[11]) == 7, 'wrong length for subch ' + str(subchs[11]) - assert _get_pdf_len(subchs[12]) == 18, 'wrong length for subch ' + str(subchs[12]) - assert _get_pdf_len(subchs[13]) == 20, 'wrong length for subch ' + str(subchs[13]) - assert _get_pdf_len(subchs[14]) == 15, 'wrong length for subch ' + str(subchs[14]) - assert _get_pdf_len(subchs[15]) == 8, 'wrong length for subch ' + str(subchs[15]) - assert _get_pdf_len(subchs[16]) == 5, 'wrong length for subch ' + str(subchs[16]) + assert "children" in ch3, "no subchapters found in ch3" + assert len(ch3["children"]) == 17, "wrong number of 
subchapters" + subchs = ch3["children"] + assert _get_pdf_len(subchs[0]) == 6, "wrong length for subch " + str(subchs[0]) + assert _get_pdf_len(subchs[1]) == 8, "wrong length for subch " + str(subchs[1]) + assert _get_pdf_len(subchs[2]) == 14, "wrong length for subch " + str(subchs[2]) + assert _get_pdf_len(subchs[3]) == 14, "wrong length for subch " + str(subchs[3]) + assert _get_pdf_len(subchs[4]) == 11, "wrong length for subch " + str(subchs[4]) + assert _get_pdf_len(subchs[5]) == 13, "wrong length for subch " + str(subchs[5]) + assert _get_pdf_len(subchs[6]) == 13, "wrong length for subch " + str(subchs[6]) + assert _get_pdf_len(subchs[7]) == 10, "wrong length for subch " + str(subchs[7]) + assert _get_pdf_len(subchs[8]) == 13, "wrong length for subch " + str(subchs[8]) + assert _get_pdf_len(subchs[9]) == 15, "wrong length for subch " + str(subchs[9]) + assert _get_pdf_len(subchs[10]) == 16, "wrong length for subch " + str( + subchs[10] + ) + assert _get_pdf_len(subchs[11]) == 7, "wrong length for subch " + str( + subchs[11] + ) + assert _get_pdf_len(subchs[12]) == 18, "wrong length for subch " + str( + subchs[12] + ) + assert _get_pdf_len(subchs[13]) == 20, "wrong length for subch " + str( + subchs[13] + ) + assert _get_pdf_len(subchs[14]) == 15, "wrong length for subch " + str( + subchs[14] + ) + assert _get_pdf_len(subchs[15]) == 8, "wrong length for subch " + str( + subchs[15] + ) + assert _get_pdf_len(subchs[16]) == 5, "wrong length for subch " + str( + subchs[16] + ) # Test helpers ################################################################################ + def _get_pdf_len(str_or_dict_with_path_attr): if isinstance(str_or_dict_with_path_attr, str): path = str_or_dict_with_path_attr else: - path = str_or_dict_with_path_attr['path'] - with open(path, 'rb') as pdffile: + path = str_or_dict_with_path_attr["path"] + with open(path, "rb") as pdffile: pdf = PdfFileReader(pdffile) return pdf.numPages def _check_pagerange_matches_title_len(pagerange): # print(chapter_dict) - title = pagerange['title'] - m = re.search(r'\(len=(?P\d*)\)', title) - assert m, 'no len=?? found in title' - len_expected = int(m.groupdict()['len']) - len_observed = pagerange['page_end'] - pagerange['page_start'] - assert len_observed == len_expected, 'Wrong page_range len detected in ' + str(pagerange) + title = pagerange["title"] + m = re.search(r"\(len=(?P\d*)\)", title) + assert m, "no len=?? found in title" + len_expected = int(m.groupdict()["len"]) + len_observed = pagerange["page_end"] - pagerange["page_start"] + assert len_observed == len_expected, "Wrong page_range len detected in " + str( + pagerange + ) + def _check_path_matches_title_len(chapter_dict): # print(chapter_dict) - title = chapter_dict['title'] - m = re.search(r'\(len=(?P\d*)\)', title) - assert m, 'no len=?? found in title' - len_expected = int(m.groupdict()['len']) - len_observed = _get_pdf_len(chapter_dict['path']) - assert len_observed == len_expected, 'Wrong len detected in ' + str(chapter_dict) - + title = chapter_dict["title"] + m = re.search(r"\(len=(?P\d*)\)", title) + assert m, "no len=?? 
found in title" + len_expected = int(m.groupdict()["len"]) + len_observed = _get_pdf_len(chapter_dict["path"]) + assert len_observed == len_expected, "Wrong len detected in " + str(chapter_dict) diff --git a/tests/test_requests.py b/tests/test_requests.py index 11778570..2d051663 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,31 +1,39 @@ """ Tests for handling requests to Kolibri Studio """ - import copy -import pytest import uuid + +import pytest from le_utils.constants import licenses -from ricecooker.classes.nodes import TopicNode, DocumentNode -from ricecooker.managers.tree import ChannelManager + +from ricecooker.classes.nodes import DocumentNode +from ricecooker.classes.nodes import TopicNode from ricecooker.exceptions import InvalidNodeException +from ricecooker.managers.tree import ChannelManager """ *********** TOPIC FIXTURES *********** """ + + @pytest.fixture def topic_id(): return "topic-id" + @pytest.fixture def topic_content_id(channel_domain_namespace, topic_id): return uuid.uuid5(channel_domain_namespace, topic_id) + @pytest.fixture def topic_node_id(channel_node_id, topic_content_id): return uuid.uuid5(channel_node_id, topic_content_id.hex) + @pytest.fixture def topic(topic_id): return TopicNode(topic_id, "Topic") + @pytest.fixture def invalid_topic(topic_id): topic = TopicNode(topic_id, "Topic") @@ -33,8 +41,9 @@ def invalid_topic(topic_id): return topic - """ *********** LOCAL DOCUMENT FIXTURES *********** """ + + @pytest.fixture def invalid_document(document_file): node = DocumentNode("invalid", "Document", licenses.CC_BY, files=[document_file]) @@ -43,18 +52,22 @@ def invalid_document(document_file): """ *********** TREE FIXTURES *********** """ + + @pytest.fixture def tree(channel, topic, document): topic.add_child(document) channel.add_child(topic) return ChannelManager(channel) + @pytest.fixture def invalid_tree(invalid_channel, invalid_topic, invalid_document): invalid_topic.add_child(invalid_document) invalid_channel.add_child(invalid_topic) return ChannelManager(invalid_channel) + @pytest.fixture def invalid_tree_2(channel, topic, invalid_document): channel_copy = copy.deepcopy(channel) @@ -65,37 +78,49 @@ def invalid_tree_2(channel, topic, invalid_document): """ TESTS """ + + def test_validate(tree, invalid_tree, invalid_tree_2): assert tree.validate(), "Tree should pass validation" pytest.raises(InvalidNodeException, invalid_tree.validate) pytest.raises(InvalidNodeException, invalid_tree_2.validate) + def test_check_for_files_failed(): assert True + def test_get_file_diff(): assert True + def test_upload_files(): assert True + def test_reattempt_upload_fails(): assert True + def test_upload_tree(): assert True + def test_reattempt_failed(): assert True + def test_add_channel(): assert True + def test_add_nodes(): assert True + def test_commit_channel(): assert True + def test_publish(): assert True diff --git a/tests/test_settings.py b/tests/test_settings.py index 56c665dd..2e2a0656 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -5,10 +5,7 @@ from ricecooker import chefs -settings = { - 'generate-missing-thumbnails': True, - 'compress-videos': True -} +settings = {"generate-missing-thumbnails": True, "compress-videos": True} def test_settings_unset_default(): @@ -35,45 +32,45 @@ def test_cli_args_override_settings(): takes precedence over the default setting. 
""" - test_argv = ['sushichef.py', '--compress', '--thumbnails', '--token', '12345'] + test_argv = ["sushichef.py", "--compress", "--thumbnails", "--token", "12345"] - with patch.object(sys, 'argv', test_argv): + with patch.object(sys, "argv", test_argv): chef = chefs.SushiChef() - chef.SETTINGS['generate-missing-thumbnails'] = False - chef.SETTINGS['compress-videos'] = False + chef.SETTINGS["generate-missing-thumbnails"] = False + chef.SETTINGS["compress-videos"] = False - assert chef.get_setting('generate-missing-thumbnails') == False - assert chef.get_setting('compress-videos') == False + assert chef.get_setting("generate-missing-thumbnails") == False + assert chef.get_setting("compress-videos") == False chef.parse_args_and_options() - assert chef.get_setting('generate-missing-thumbnails') == True - assert chef.get_setting('compress-videos') == True + assert chef.get_setting("generate-missing-thumbnails") == True + assert chef.get_setting("compress-videos") == True - test_argv = ['sushichef.py', '--compress', '--thumbnails', '--token', '12345'] + test_argv = ["sushichef.py", "--compress", "--thumbnails", "--token", "12345"] - with patch.object(sys, 'argv', test_argv): + with patch.object(sys, "argv", test_argv): chef = chefs.SushiChef() assert len(chef.SETTINGS) == 0 - assert chef.get_setting('generate-missing-thumbnails') == None - assert chef.get_setting('compress-videos') == None + assert chef.get_setting("generate-missing-thumbnails") == None + assert chef.get_setting("compress-videos") == None chef.parse_args_and_options() - assert chef.get_setting('generate-missing-thumbnails') == True - assert chef.get_setting('compress-videos') == True + assert chef.get_setting("generate-missing-thumbnails") == True + assert chef.get_setting("compress-videos") == True # now test without setting the flags - test_argv = ['sushichef.py', '--token', '12345'] + test_argv = ["sushichef.py", "--token", "12345"] - with patch.object(sys, 'argv', test_argv): + with patch.object(sys, "argv", test_argv): chef = chefs.SushiChef() - chef.SETTINGS['generate-missing-thumbnails'] = False - chef.SETTINGS['compress-videos'] = False + chef.SETTINGS["generate-missing-thumbnails"] = False + chef.SETTINGS["compress-videos"] = False - assert chef.get_setting('generate-missing-thumbnails') == False - assert chef.get_setting('compress-videos') == False + assert chef.get_setting("generate-missing-thumbnails") == False + assert chef.get_setting("compress-videos") == False chef.parse_args_and_options() - assert chef.get_setting('generate-missing-thumbnails') == False - assert chef.get_setting('compress-videos') == False + assert chef.get_setting("generate-missing-thumbnails") == False + assert chef.get_setting("compress-videos") == False diff --git a/tests/test_thumbnails.py b/tests/test_thumbnails.py index eea45daf..d0a1990f 100644 --- a/tests/test_thumbnails.py +++ b/tests/test_thumbnails.py @@ -1,24 +1,34 @@ import os + import PIL import pytest - from le_utils.constants import licenses +from test_tree import thumbnail_path +from test_tree import thumbnail_path_jpg +from test_videos import _clear_ricecookerfilecache +from test_videos import low_res_video + from ricecooker import config -from ricecooker.classes.files import AudioFile, DocumentFile, HTMLZipFile, ThumbnailFile, TiledThumbnailFile, VideoFile -from ricecooker.classes.nodes import AudioNode, DocumentNode, HTML5AppNode, TopicNode, VideoNode +from ricecooker.classes.files import AudioFile +from ricecooker.classes.files import DocumentFile +from 
ricecooker.classes.files import HTMLZipFile +from ricecooker.classes.files import ThumbnailFile +from ricecooker.classes.files import TiledThumbnailFile +from ricecooker.classes.files import VideoFile +from ricecooker.classes.nodes import AudioNode +from ricecooker.classes.nodes import DocumentNode +from ricecooker.classes.nodes import HTML5AppNode +from ricecooker.classes.nodes import TopicNode +from ricecooker.classes.nodes import VideoNode -from test_tree import thumbnail_path, thumbnail_path_jpg -from test_videos import low_res_video -from test_videos import _clear_ricecookerfilecache +SHOW_THUMBS = False # set to True to show outputs when running tests locally -SHOW_THUMBS = False # set to True to show outputs when running tests locally +THUMBNAIL_URL = "https://raw.githubusercontent.com/learningequality/ricecooker/master/tests/testcontent/samples/thumbnail.png" -THUMBNAIL_URL = 'https://raw.githubusercontent.com/learningequality/ricecooker/master/tests/testcontent/samples/thumbnail.png' class TestThumbnailSetting(object): - def setup_method(self, test_method): """ Called before each test method executes. @@ -27,61 +37,70 @@ def setup_method(self, test_method): config.FAILED_FILES = [] def get_video_node(self, path, thumbnail=None): - video_file = VideoFile(path, language='en') - video_node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN, thumbnail=thumbnail) + video_file = VideoFile(path, language="en") + video_node = VideoNode( + "vid-src-id", "Video", licenses.PUBLIC_DOMAIN, thumbnail=thumbnail + ) video_node.add_file(video_file) return video_node def check_correct_thumbnail(self, node): - expected_thumbnail_filename = 'eb79354ddd5774bb3436f9a19c282bff.png' + expected_thumbnail_filename = "eb79354ddd5774bb3436f9a19c282bff.png" thumbnail_files = [f for f in node.files if isinstance(f, ThumbnailFile)] - assert len(thumbnail_files) == 1, 'multiple thumbnails found' + assert len(thumbnail_files) == 1, "multiple thumbnails found" thumbnail_file = thumbnail_files[0] thumbnail_filename = thumbnail_file.get_filename() assert thumbnail_filename == expected_thumbnail_filename, "Wrong thumbnail" def assert_failed_thumbnail(self, node): thumbnail_files = [f for f in node.files if isinstance(f, ThumbnailFile)] - assert len(thumbnail_files) == 1, 'multiple thumbnails found' + assert len(thumbnail_files) == 1, "multiple thumbnails found" thumbnail_file = thumbnail_files[0] - assert thumbnail_file.filename == None, 'filename should be None' + assert thumbnail_file.filename == None, "filename should be None" failed_files = config.FAILED_FILES # for ff in failed_files: # print(ff, ff.path, ff.error) - assert len(failed_files) == 1, 'multiple failed files found' + assert len(failed_files) == 1, "multiple failed files found" failed_file = failed_files[0] - assert failed_file.error, 'must have error set' - assert thumbnail_file == failed_file, 'bad thumbnail file not found in config.FAILED_FILES' - + assert failed_file.error, "must have error set" + assert ( + thumbnail_file == failed_file + ), "bad thumbnail file not found in config.FAILED_FILES" # HAPPY PATHS ############################################################################ def test_set_png_thumbnail_from_local_path(self, low_res_video, thumbnail_path): - video_node = self.get_video_node(path=low_res_video.name, thumbnail=thumbnail_path) + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=thumbnail_path + ) video_node.validate() _ = video_node.process_files() self.check_correct_thumbnail(video_node) def 
test_set_jpg_thumbnail_from_local_path(self, low_res_video, thumbnail_path_jpg): - video_node = self.get_video_node(path=low_res_video.name, thumbnail=thumbnail_path_jpg) + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=thumbnail_path_jpg + ) video_node.validate() _ = video_node.process_files() - expected_thumbnail_filename = 'd7ab03e4263fc374737d96ac2da156c1.jpg' + expected_thumbnail_filename = "d7ab03e4263fc374737d96ac2da156c1.jpg" thumbnail_files = [f for f in video_node.files if isinstance(f, ThumbnailFile)] - assert len(thumbnail_files) == 1, 'multiple thumbnails found' + assert len(thumbnail_files) == 1, "multiple thumbnails found" thumbnail_file = thumbnail_files[0] thumbnail_filename = thumbnail_file.get_filename() assert thumbnail_filename == expected_thumbnail_filename, "Wrong thumbnail" def test_set_thumbnail_from_url(self, low_res_video): - video_node = self.get_video_node(path=low_res_video.name, thumbnail=THUMBNAIL_URL) + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=THUMBNAIL_URL + ) video_node.validate() _ = video_node.process_files() self.check_correct_thumbnail(video_node) def test_set_thumbnail_from_url_with_querystring(self, low_res_video): - url = THUMBNAIL_URL + '?querystringkey=querystringvalue' + url = THUMBNAIL_URL + "?querystringkey=querystringvalue" video_node = self.get_video_node(path=low_res_video.name, thumbnail=url) video_node.validate() _ = video_node.process_files() @@ -89,7 +108,9 @@ def test_set_thumbnail_from_url_with_querystring(self, low_res_video): def test_set_thumbnail_from_ThumbnailFile(self, low_res_video, thumbnail_path): thumbnail_file = ThumbnailFile(thumbnail_path) - video_node = self.get_video_node(path=low_res_video.name, thumbnail=thumbnail_file) + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=thumbnail_file + ) video_node.validate() _ = video_node.process_files() self.check_correct_thumbnail(video_node) @@ -102,13 +123,14 @@ def test_add_ThumbnailFile(self, low_res_video, thumbnail_path): _ = video_node.process_files() self.check_correct_thumbnail(video_node) - # ERROR PATHS ############################################################################ def test_set_thumbnail_from_non_existent_path(self, low_res_video): - non_existent_path = 'does/not/exist.png' - video_node = self.get_video_node(path=low_res_video.name, thumbnail=non_existent_path) + non_existent_path = "does/not/exist.png" + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=non_existent_path + ) video_node.validate() _ = video_node.process_files() self.assert_failed_thumbnail(video_node) @@ -117,15 +139,15 @@ def test_set_thumbnail_from_bad_path(self, low_res_video, fake_thumbnail_file): """ File path exists, but is not a valid PNG file. """ - video_node = self.get_video_node(path=low_res_video.name, thumbnail=fake_thumbnail_file) + video_node = self.get_video_node( + path=low_res_video.name, thumbnail=fake_thumbnail_file + ) video_node.validate() _ = video_node.process_files() self.assert_failed_thumbnail(video_node) - class TestThumbnailGeneration(object): - def setup_method(self, test_method): """ Called before each test method executes. 
@@ -135,8 +157,12 @@ def setup_method(self, test_method): config.THUMBNAILS = False def check_has_thumbnail(self, node): - thumbnail_files = [f for f in node.files if isinstance(f, ThumbnailFile) or isinstance(f, TiledThumbnailFile)] - assert len(thumbnail_files) == 1, 'expected single thumbnail' + thumbnail_files = [ + f + for f in node.files + if isinstance(f, ThumbnailFile) or isinstance(f, TiledThumbnailFile) + ] + assert len(thumbnail_files) == 1, "expected single thumbnail" thumbnail_file = thumbnail_files[0] thumbnail_filename = thumbnail_file.get_filename() thumbnail_path = config.get_storage_path(thumbnail_filename) @@ -149,158 +175,180 @@ def check_has_thumbnail(self, node): def assert_failed_thumbnail(self, node): thumbnail_files = [f for f in node.files if isinstance(f, ThumbnailFile)] - assert len(thumbnail_files) == 1, 'multiple thumbnails found' + assert len(thumbnail_files) == 1, "multiple thumbnails found" thumbnail_file = thumbnail_files[0] - assert thumbnail_file.filename == None, 'filename should be None' + assert thumbnail_file.filename == None, "filename should be None" failed_files = config.FAILED_FILES # for ff in failed_files: # print(ff, ff.path, ff.error) - assert len(failed_files) == 1, 'multiple failed files found' + assert len(failed_files) == 1, "multiple failed files found" failed_file = failed_files[0] - assert failed_file.error, 'must have error set' - assert thumbnail_file == failed_file, 'bad thumbnail file not found in config.FAILED_FILES' - + assert failed_file.error, "must have error set" + assert ( + thumbnail_file == failed_file + ), "bad thumbnail file not found in config.FAILED_FILES" # HAPPY PATHS ############################################################################ def test_generate_thumbnail_from_pdf(self, document_file): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(document_file) config.THUMBNAILS = True filenames = node.process_files() - assert len(filenames) == 2, 'expected two filenames' + assert len(filenames) == 2, "expected two filenames" self.check_has_thumbnail(node) def test_generate_thumbnail_from_epub(self, epub_file): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(epub_file) config.THUMBNAILS = True filenames = node.process_files() - assert len(filenames) == 2, 'expected two filenames' + assert len(filenames) == 2, "expected two filenames" self.check_has_thumbnail(node) def test_generate_thumbnail_from_html(self, html_file): - node = HTML5AppNode('html-src-id', "HTML5 App", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = HTML5AppNode( + "html-src-id", "HTML5 App", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(html_file) config.THUMBNAILS = True filenames = node.process_files() - assert len(filenames) == 2, 'expected two filenames' + assert len(filenames) == 2, "expected two filenames" self.check_has_thumbnail(node) def test_generate_thumbnail_from_video(self, video_file): - node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = VideoNode("vid-src-id", "Video", licenses.PUBLIC_DOMAIN, thumbnail=None) node.add_file(video_file) config.THUMBNAILS = True filenames = node.process_files() - assert len(filenames) == 2, 'expected two filenames' + assert len(filenames) == 2, "expected two 
filenames" self.check_has_thumbnail(node) def test_generate_tiled_thumbnail(self, document, html, video, audio): - topic = TopicNode('test-topic', 'Topic') + topic = TopicNode("test-topic", "Topic") topic.add_child(document) topic.add_child(html) topic.add_child(video) topic.add_child(audio) config.THUMBNAILS = True - for child in topic.children: # must process children before topic node + for child in topic.children: # must process children before topic node child.process_files() filenames = topic.process_files() - assert len(filenames) == 1, 'expected one filename' + assert len(filenames) == 1, "expected one filename" self.check_has_thumbnail(topic) - # ERROR PATHS ############################################################################ def test_non_existent_pdf_fails(self): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) - non_existent_path = 'does/not/exist.pdf' - document_file = DocumentFile(non_existent_path, language='en') + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) + non_existent_path = "does/not/exist.pdf" + document_file = DocumentFile(non_existent_path, language="en") node.add_file(document_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None (the non existent pdf)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' + assert filenames == [None], "expected one None (the non existent pdf)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_invalid_pdf_fails(self, invalid_document_file): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(invalid_document_file) config.THUMBNAILS = True filenames = node.process_files() # assert filenames == [None], 'expected one None filename (the broken pdf)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' - + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_non_existent_epub_fails(self): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) - non_existent_path = 'does/not/exist.epub' - document_file = DocumentFile(non_existent_path, language='en') + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) + non_existent_path = "does/not/exist.epub" + document_file = DocumentFile(non_existent_path, language="en") node.add_file(document_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None (the non existent epub)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' + assert filenames == [None], "expected one None (the non existent epub)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_invalid_epub_fails(self, invalid_epub_file): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(invalid_epub_file) config.THUMBNAILS = True filenames = node.process_files() # assert filenames == [None], 'expected one None filename (the broken epub)' # TODO: implement epub deep validation - assert len(config.FAILED_FILES) == 1, 'expected one failed file' - + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_non_existent_htmlzip_fails(self): - node = 
HTML5AppNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) - non_existent_path = 'does/not/exist.zip' - html_file = HTMLZipFile(non_existent_path, language='en') + node = HTML5AppNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) + non_existent_path = "does/not/exist.zip" + html_file = HTMLZipFile(non_existent_path, language="en") node.add_file(html_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None filename (the broken zip)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' + assert filenames == [None], "expected one None filename (the broken zip)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_invalid_htmlzip_fails(self, html_invalid_file): - node = DocumentNode('doc-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = DocumentNode( + "doc-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(html_invalid_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None filename (the broken html)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' - + assert filenames == [None], "expected one None filename (the broken html)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_non_existent_mp3_fails(self): - node = AudioNode('audio-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) - non_existent_path = 'does/not/exist.mp3' - document_file = AudioFile(non_existent_path, language='en') + node = AudioNode( + "audio-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) + non_existent_path = "does/not/exist.mp3" + document_file = AudioFile(non_existent_path, language="en") node.add_file(document_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None (the non existent mp3)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' + assert filenames == [None], "expected one None (the non existent mp3)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_invalid_mp3_fails(self, invalid_audio_file): - node = AudioNode('audio-src-id', "Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = AudioNode( + "audio-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(invalid_audio_file) config.THUMBNAILS = True filenames = node.process_files() # assert filenames == [None], 'expected one None filename (the broken mp3)' # TODO: implement mp3 deep validation # assert len(config.FAILED_FILES) == 1, 'expected one failed file' - def test_non_existent_mp4_fails(self): - node = VideoNode('video-src-id', "Video", licenses.PUBLIC_DOMAIN, thumbnail=None) - non_existent_path = 'does/not/exist.mp4' - document_file = VideoFile(non_existent_path, language='en') + node = VideoNode( + "video-src-id", "Video", licenses.PUBLIC_DOMAIN, thumbnail=None + ) + non_existent_path = "does/not/exist.mp4" + document_file = VideoFile(non_existent_path, language="en") node.add_file(document_file) config.THUMBNAILS = True filenames = node.process_files() - assert filenames == [None], 'expected one None (the non existent mp4)' - assert len(config.FAILED_FILES) == 1, 'expected one failed file' + assert filenames == [None], "expected one None (the non existent mp4)" + assert len(config.FAILED_FILES) == 1, "expected one failed file" def test_invalid_mp4_fails(self, invalid_video_file): - node = VideoNode('video-src-id', 
"Document", licenses.PUBLIC_DOMAIN, thumbnail=None) + node = VideoNode( + "video-src-id", "Document", licenses.PUBLIC_DOMAIN, thumbnail=None + ) node.add_file(invalid_video_file) config.THUMBNAILS = True filenames = node.process_files() diff --git a/tests/test_tree.py b/tests/test_tree.py index 0a5b843a..2e5d6927 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -1,16 +1,18 @@ """ Tests for tree construction """ - import copy -import pytest import uuid -from le_utils.constants import licenses, file_types -from ricecooker.classes.nodes import * + +import pytest +from le_utils.constants import file_types +from le_utils.constants import licenses +from le_utils.constants.languages import getlang + from ricecooker.classes.files import * from ricecooker.classes.licenses import * +from ricecooker.classes.nodes import * from ricecooker.exceptions import InvalidNodeException -from ricecooker.utils.zip import create_predictable_zip from ricecooker.utils.jsontrees import build_tree_from_json -from le_utils.constants.languages import getlang +from ricecooker.utils.zip import create_predictable_zip """ *********** TOPIC FIXTURES *********** """ @@ -81,8 +83,12 @@ def license_name(): @pytest.fixture -def document(document_id, document_file, thumbnail_path, copyright_holder, license_name): - node = DocumentNode(document_id, "Document", licenses.CC_BY, thumbnail=thumbnail_path) +def document( + document_id, document_file, thumbnail_path, copyright_holder, license_name +): + node = DocumentNode( + document_id, "Document", licenses.CC_BY, thumbnail=thumbnail_path + ) node.add_file(document_file) node.set_license(license_name, copyright_holder=copyright_holder) return node @@ -132,22 +138,42 @@ def test_nodes_initialized(channel, topic, document): def test_add_child(tree, topic, document): assert tree.children[0] == topic, "Channel should have topic child node" - assert tree.children[0].children[0] == document, "Topic should have a document child node" - - -def test_ids(tree, channel_node_id, channel_content_id, topic_content_id, topic_node_id, document_content_id, - document_node_id): + assert ( + tree.children[0].children[0] == document + ), "Topic should have a document child node" + + +def test_ids( + tree, + channel_node_id, + channel_content_id, + topic_content_id, + topic_node_id, + document_content_id, + document_node_id, +): channel = tree topic = tree.children[0] document = topic.children[0] - assert channel.get_content_id() == channel_content_id, "Channel content id should be {}".format(channel_content_id) - assert channel.get_node_id() == channel_node_id, "Channel node id should be {}".format(channel_node_id) - assert topic.get_content_id() == topic_content_id, "Topic content id should be {}".format(topic_content_id) - assert topic.get_node_id() == topic_node_id, "Topic node id should be {}".format(topic_node_id) - assert document.get_content_id() == document_content_id, "Document content id should be {}".format( - document_content_id) - assert document.get_node_id() == document_node_id, "Document node id should be {}".format(document_node_id) + assert ( + channel.get_content_id() == channel_content_id + ), "Channel content id should be {}".format(channel_content_id) + assert ( + channel.get_node_id() == channel_node_id + ), "Channel node id should be {}".format(channel_node_id) + assert ( + topic.get_content_id() == topic_content_id + ), "Topic content id should be {}".format(topic_content_id) + assert topic.get_node_id() == topic_node_id, "Topic node id should be {}".format( + 
topic_node_id + ) + assert ( + document.get_content_id() == document_content_id + ), "Document content id should be {}".format(document_content_id) + assert ( + document.get_node_id() == document_node_id + ), "Document node id should be {}".format(document_node_id) def test_add_file(document, document_file): @@ -159,8 +185,9 @@ def test_add_file(document, document_file): def test_thumbnail(topic, document, thumbnail_path): assert document.has_thumbnail(), "Document must have a thumbnail" assert not topic.has_thumbnail(), "Topic must not have a thumbnail" - assert [f for f in document.files if - f.path == thumbnail_path], "Document is missing a thumbnail with path {}".format(thumbnail_path) + assert [ + f for f in document.files if f.path == thumbnail_path + ], "Document is missing a thumbnail with path {}".format(thumbnail_path) def test_count(tree): @@ -168,14 +195,23 @@ def test_count(tree): def test_get_non_topic_descendants(tree, document): - assert tree.get_non_topic_descendants() == [document], "Channel should only have 1 non-topic descendant" + assert tree.get_non_topic_descendants() == [ + document + ], "Channel should only have 1 non-topic descendant" def test_licenses(channel, topic, document, license_name, copyright_holder): - assert isinstance(document.license, License), "Document should have a license object" - assert document.license.license_id == license_name, "Document license should have public domain license" - assert document.license.copyright_holder == copyright_holder, "Document license should have copyright holder set to {}".format( - copyright_holder) + assert isinstance( + document.license, License + ), "Document should have a license object" + assert ( + document.license.license_id == license_name + ), "Document license should have public domain license" + assert ( + document.license.copyright_holder == copyright_holder + ), "Document license should have copyright holder set to {}".format( + copyright_holder + ) assert not channel.license, "Channel should not have a license" assert not topic.license, "Topic should not have a license" @@ -204,7 +240,7 @@ def test_add_files_with_preset(channel): kind=content_kinds.TOPIC, source_id="test:container", title="test title", - language=getlang('ar').code, + language=getlang("ar").code, children=[], ) audio_path = os.path.join("tests/media_utils/audio/file_example_MP3_700KB.mp3") @@ -212,11 +248,11 @@ def test_add_files_with_preset(channel): audio_node = dict( kind=content_kinds.AUDIO, source_id="audio_node", - title='audio_node', - description='audio_node description', + title="audio_node", + description="audio_node description", language=getlang("ar").code, - license=get_license('CC BY', copyright_holder='Demo Holdings').as_dict(), - author='author name', + license=get_license("CC BY", copyright_holder="Demo Holdings").as_dict(), + author="author name", files=[ { "file_type": file_types.AUDIO, @@ -227,20 +263,23 @@ def test_add_files_with_preset(channel): ) inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir, 'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) - files = [{ - "file_type": file_types.HTML5, - "path": zip_path, - "language": getlang("ar").code, - }, { - "file_type": file_types.AUDIO, - "path": audio_path, - "language": getlang("ar").code, - "preset": format_presets.AUDIO_DEPENDENCY - }] + files = [ + { + "file_type": file_types.HTML5, + 
"path": zip_path, + "language": getlang("ar").code, + }, + { + "file_type": file_types.AUDIO, + "path": audio_path, + "language": getlang("ar").code, + "preset": format_presets.AUDIO_DEPENDENCY, + }, + ] html5_dict = dict( kind=content_kinds.HTML5, @@ -248,10 +287,10 @@ def test_add_files_with_preset(channel): title="source_test_id", description="test_description", language=getlang("ar").code, - license=get_license('CC BY', copyright_holder='Demo Holdings').as_dict(), + license=get_license("CC BY", copyright_holder="Demo Holdings").as_dict(), author="Test author", thumbnail="tests/testcontent/samples/thumbnail.jpg", - files=files + files=files, ) topic_node["children"].append(html5_dict) topic_node["children"].append(audio_node) @@ -261,7 +300,7 @@ def test_add_files_with_preset(channel): assert parent_node.validate_tree() assert parent_node assert parent_node.children[0] - assert topic_node.kind == 'topic' + assert topic_node.kind == "topic" assert len(html5_node.files) == 3 assert html5_node.files[2].get_preset() == format_presets.AUDIO_DEPENDENCY @@ -273,46 +312,45 @@ def test_slideshow_node_via_files(channel): slideshow_node = SlideshowNode( title="The Slideshow", description="Slideshow Content Demo", - source_id='demo', + source_id="demo", author="DE Mo", - language='en', - license=get_license('CC BY', copyright_holder='Demo Holdings'), + language="en", + license=get_license("CC BY", copyright_holder="Demo Holdings"), files=[ SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Demo blocks are neat.", - descriptive_text="Demo blocks are neat." + descriptive_text="Demo blocks are neat.", ), SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Touch the demo to learn new things!", - descriptive_text="Touch the demo to learn new things!" + descriptive_text="Touch the demo to learn new things!", ), SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Made mostly with Python!", - descriptive_text="Made mostly with Python!" + descriptive_text="Made mostly with Python!", ), SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Unlock your potential with this demo.", - descriptive_text="Unlock your potential with this demo." 
+ descriptive_text="Unlock your potential with this demo.", ), ThumbnailFile( - path='tests/testcontent/samples/thumbnail.png', - language='en' - ) - ] + path="tests/testcontent/samples/thumbnail.png", language="en" + ), + ], ) assert slideshow_node - assert slideshow_node.kind == 'slideshow' - assert len(slideshow_node.files) == 5, 'missing files' - assert slideshow_node.extra_fields, 'missing extra_fields' - assert 'slideshow_data' in slideshow_node.extra_fields, 'missing slideshow_data key' + assert slideshow_node.kind == "slideshow" + assert len(slideshow_node.files) == 5, "missing files" + assert slideshow_node.extra_fields, "missing extra_fields" + assert "slideshow_data" in slideshow_node.extra_fields, "missing slideshow_data key" slideshow_node.process_files() channel.add_child(slideshow_node) assert channel.validate_tree() @@ -323,35 +361,34 @@ def test_slideshow_node_via_add_file(channel): slideshow_node = SlideshowNode( title="The Slideshow via add_files", description="Slideshow Content Demo", - source_id='demo2', + source_id="demo2", author="DE Mo", - language='en', - license=get_license('CC BY', copyright_holder='Demo Holdings'), - files=[] + language="en", + license=get_license("CC BY", copyright_holder="Demo Holdings"), + files=[], ) slideimg1 = SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Demo blocks are neat.", - descriptive_text="Demo blocks are neat." + descriptive_text="Demo blocks are neat.", ) slideshow_node.add_file(slideimg1) slideimg2 = SlideImageFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en', + path="tests/testcontent/samples/thumbnail.jpg", + language="en", caption="Touch the demo to learn new things!", - descriptive_text="Touch the demo to learn new things!" 
+ descriptive_text="Touch the demo to learn new things!", ) slideshow_node.add_file(slideimg2) thumbimg1 = ThumbnailFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en' + path="tests/testcontent/samples/thumbnail.jpg", language="en" ) slideshow_node.add_file(thumbimg1) # print(slideshow_node.__dict__) assert slideshow_node - assert len(slideshow_node.files) == 3, 'missing files' + assert len(slideshow_node.files) == 3, "missing files" channel.add_child(slideshow_node) assert channel.validate_tree() @@ -362,35 +399,37 @@ def test_slideshow_node_via_add_file(channel): def test_custom_navigation_node_via_files(channel): inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir, 'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) custom_navigation_node = CustomNavigationNode( title="The Nav App", description="Custom Navigation Content Demo", - source_id='demo', + source_id="demo", author="DE Mo", - language='en', - license=get_license('CC BY', copyright_holder='Demo Holdings'), + language="en", + license=get_license("CC BY", copyright_holder="Demo Holdings"), files=[ HTMLZipFile( path=zip_path, - language='en', + language="en", ), ThumbnailFile( - path='tests/testcontent/samples/thumbnail.png', - language='en' - ) - ] + path="tests/testcontent/samples/thumbnail.png", language="en" + ), + ], ) assert custom_navigation_node - assert custom_navigation_node.kind == 'topic' - assert len(custom_navigation_node.files) == 2, 'missing files' - assert custom_navigation_node.extra_fields, 'missing extra_fields' - assert "options" in custom_navigation_node.extra_fields and 'modality' in custom_navigation_node.extra_fields[ - "options"] and custom_navigation_node.extra_fields["options"][ - "modality"] == "CUSTOM_NAVIGATION", 'missing custom navigation modality' + assert custom_navigation_node.kind == "topic" + assert len(custom_navigation_node.files) == 2, "missing files" + assert custom_navigation_node.extra_fields, "missing extra_fields" + assert ( + "options" in custom_navigation_node.extra_fields + and "modality" in custom_navigation_node.extra_fields["options"] + and custom_navigation_node.extra_fields["options"]["modality"] + == "CUSTOM_NAVIGATION" + ), "missing custom navigation modality" custom_navigation_node.process_files() channel.add_child(custom_navigation_node) assert channel.validate_tree() @@ -399,36 +438,38 @@ def test_custom_navigation_node_via_files(channel): def test_custom_navigation_node_via_add_file(channel): inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir, 'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) custom_navigation_node = CustomNavigationNode( title="The Slideshow via add_files", description="Slideshow Content Demo", - source_id='demo2', + source_id="demo2", author="DE Mo", - language='en', - license=get_license('CC BY', copyright_holder='Demo Holdings'), - files=[] + language="en", + license=get_license("CC BY", copyright_holder="Demo Holdings"), + files=[], ) zipfile = HTMLZipFile( path=zip_path, - language='en', + language="en", ) custom_navigation_node.add_file(zipfile) thumbimg1 = ThumbnailFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en' + path="tests/testcontent/samples/thumbnail.jpg", 
language="en" ) custom_navigation_node.add_file(thumbimg1) assert custom_navigation_node - assert custom_navigation_node.kind == 'topic' - assert len(custom_navigation_node.files) == 2, 'missing files' - assert custom_navigation_node.extra_fields, 'missing extra_fields' - assert "options" in custom_navigation_node.extra_fields and 'modality' in custom_navigation_node.extra_fields[ - "options"] and custom_navigation_node.extra_fields["options"][ - "modality"] == "CUSTOM_NAVIGATION", 'missing custom navigation modality' + assert custom_navigation_node.kind == "topic" + assert len(custom_navigation_node.files) == 2, "missing files" + assert custom_navigation_node.extra_fields, "missing extra_fields" + assert ( + "options" in custom_navigation_node.extra_fields + and "modality" in custom_navigation_node.extra_fields["options"] + and custom_navigation_node.extra_fields["options"]["modality"] + == "CUSTOM_NAVIGATION" + ), "missing custom navigation modality" custom_navigation_node.process_files() channel.add_child(custom_navigation_node) assert channel.validate_tree() @@ -440,81 +481,89 @@ def test_custom_navigation_node_via_add_file(channel): def test_custom_navigation_channel_node_via_files(): inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir, 'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) zipfile = HTMLZipFile( path=zip_path, - language='en', + language="en", ) thumbimg1 = ThumbnailFile( - path='tests/testcontent/samples/thumbnail.png', - language='en' + path="tests/testcontent/samples/thumbnail.png", language="en" ) custom_navigation_channel_node = CustomNavigationChannelNode( title="The Nav App", description="Custom Navigation Content Demo", - source_id='demo', - source_domain='DEMO', - language='en', + source_id="demo", + source_domain="DEMO", + language="en", files=[ zipfile, thumbimg1, - ] + ], ) assert custom_navigation_channel_node - assert custom_navigation_channel_node.kind == 'Channel' - assert len(custom_navigation_channel_node.files) == 2, 'missing files' - assert custom_navigation_channel_node.extra_fields, 'missing extra_fields' - assert 'options' in custom_navigation_channel_node.extra_fields and 'modality' in \ - custom_navigation_channel_node.extra_fields["options"] and \ - custom_navigation_channel_node.extra_fields["options"][ - "modality"] == "CUSTOM_NAVIGATION", 'missing custom navigation modality' + assert custom_navigation_channel_node.kind == "Channel" + assert len(custom_navigation_channel_node.files) == 2, "missing files" + assert custom_navigation_channel_node.extra_fields, "missing extra_fields" + assert ( + "options" in custom_navigation_channel_node.extra_fields + and "modality" in custom_navigation_channel_node.extra_fields["options"] + and custom_navigation_channel_node.extra_fields["options"]["modality"] + == "CUSTOM_NAVIGATION" + ), "missing custom navigation modality" custom_navigation_channel_node.set_thumbnail(thumbimg1) custom_navigation_channel_node.process_files() assert custom_navigation_channel_node.validate_tree() assert custom_navigation_channel_node.to_dict() assert custom_navigation_channel_node.to_dict()["thumbnail"] == thumbimg1.filename assert len(custom_navigation_channel_node.to_dict()["files"]) == 1 - assert custom_navigation_channel_node.to_dict()["files"][0]["filename"] == zipfile.filename + assert ( + 
custom_navigation_channel_node.to_dict()["files"][0]["filename"] + == zipfile.filename + ) def test_custom_navigation_channel_node_via_add_file(): inputdir = tempfile.mkdtemp() - with open(os.path.join(inputdir, 'index.html'), 'w') as testf: - testf.write('something something') + with open(os.path.join(inputdir, "index.html"), "w") as testf: + testf.write("something something") zip_path = create_predictable_zip(inputdir) custom_navigation_channel_node = CustomNavigationChannelNode( title="The Slideshow via add_files", description="Slideshow Content Demo", - source_id='demo2', - source_domain='DEMO', - language='en', - files=[] + source_id="demo2", + source_domain="DEMO", + language="en", + files=[], ) zipfile = HTMLZipFile( path=zip_path, - language='en', + language="en", ) custom_navigation_channel_node.add_file(zipfile) thumbimg1 = ThumbnailFile( - path='tests/testcontent/samples/thumbnail.jpg', - language='en' + path="tests/testcontent/samples/thumbnail.jpg", language="en" ) custom_navigation_channel_node.add_file(thumbimg1) assert custom_navigation_channel_node - assert custom_navigation_channel_node.kind == 'Channel' - assert len(custom_navigation_channel_node.files) == 2, 'missing files' - assert custom_navigation_channel_node.extra_fields, 'missing extra_fields' - assert 'options' in custom_navigation_channel_node.extra_fields and 'modality' in \ - custom_navigation_channel_node.extra_fields["options"] and \ - custom_navigation_channel_node.extra_fields["options"][ - "modality"] == "CUSTOM_NAVIGATION", 'missing custom navigation modality' + assert custom_navigation_channel_node.kind == "Channel" + assert len(custom_navigation_channel_node.files) == 2, "missing files" + assert custom_navigation_channel_node.extra_fields, "missing extra_fields" + assert ( + "options" in custom_navigation_channel_node.extra_fields + and "modality" in custom_navigation_channel_node.extra_fields["options"] + and custom_navigation_channel_node.extra_fields["options"]["modality"] + == "CUSTOM_NAVIGATION" + ), "missing custom navigation modality" custom_navigation_channel_node.set_thumbnail(thumbimg1) custom_navigation_channel_node.process_files() assert custom_navigation_channel_node.validate_tree() assert custom_navigation_channel_node.to_dict() assert custom_navigation_channel_node.to_dict()["thumbnail"] == thumbimg1.filename assert len(custom_navigation_channel_node.to_dict()["files"]) == 1 - assert custom_navigation_channel_node.to_dict()["files"][0]["filename"] == zipfile.filename + assert ( + custom_navigation_channel_node.to_dict()["files"][0]["filename"] + == zipfile.filename + ) diff --git a/tests/test_videos.py b/tests/test_videos.py index 71bd30c8..36939d6b 100644 --- a/tests/test_videos.py +++ b/tests/test_videos.py @@ -1,90 +1,113 @@ from __future__ import print_function -from cachecontrol.caches.file_cache import FileCache + import os -import pytest import re import shutil import subprocess +import pytest +from cachecontrol.caches.file_cache import FileCache +from conftest import download_fixture_file from le_utils.constants import format_presets from le_utils.constants import licenses + from ricecooker import config from ricecooker.classes.files import FILECACHE -from ricecooker.classes.files import SubtitleFile, VideoFile +from ricecooker.classes.files import SubtitleFile +from ricecooker.classes.files import VideoFile from ricecooker.classes.nodes import VideoNode -from conftest import download_fixture_file - @pytest.fixture def low_res_video(): - source_url = 
"https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4" + source_url = ( + "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4" + ) local_path = os.path.join("tests", "testcontent", "downloaded", "low_res_video.mp4") download_fixture_file(source_url, local_path) assert os.path.exists(local_path) - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute + @pytest.fixture def low_res_video_webm(): - source_url = "https://ia801800.us.archive.org/28/items/rick-astley-never-gonna-give-you-up-video_202012/" \ - "Rick%20Astley%20-%20Never%20Gonna%20Give%20You%20Up%20Video.webm" - local_path = os.path.join("tests", "testcontent", "downloaded", "low_res_video.webm") + source_url = ( + "https://ia801800.us.archive.org/28/items/rick-astley-never-gonna-give-you-up-video_202012/" + "Rick%20Astley%20-%20Never%20Gonna%20Give%20You%20Up%20Video.webm" + ) + local_path = os.path.join( + "tests", "testcontent", "downloaded", "low_res_video.webm" + ) download_fixture_file(source_url, local_path) assert os.path.exists(local_path) - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute + @pytest.fixture def high_res_video(): - source_url = "https://ia800201.us.archive.org/7/items/" \ - "UnderConstructionFREEVideoBackgroundLoopHD1080p/" \ - "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.mp4" - local_path = os.path.join("tests", "testcontent", "downloaded", "high_res_video.mp4") + source_url = ( + "https://ia800201.us.archive.org/7/items/" + "UnderConstructionFREEVideoBackgroundLoopHD1080p/" + "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.mp4" + ) + local_path = os.path.join( + "tests", "testcontent", "downloaded", "high_res_video.mp4" + ) download_fixture_file(source_url, local_path) assert os.path.exists(local_path) - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute + @pytest.fixture def high_res_video_webm(): source_url = "https://mirrors.creativecommons.org/movingimages/webm/CreativeCommonsPlusCommercial_720p.webm" - local_path = os.path.join("tests", "testcontent", "downloaded", "high_res_video.webm") + local_path = os.path.join( + "tests", "testcontent", "downloaded", "high_res_video.webm" + ) download_fixture_file(source_url, local_path) assert os.path.exists(local_path) - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute + @pytest.fixture def low_res_ogv_video(): - source_url = "https://archive.org/download/" \ - "UnderConstructionFREEVideoBackgroundLoopHD1080p/" \ - "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.ogv" - local_path = os.path.join("tests", "testcontent", "downloaded", "low_res_ogv_video.ogv") + source_url = ( + "https://archive.org/download/" + "UnderConstructionFREEVideoBackgroundLoopHD1080p/" + "UnderConstruction%20-%20FREE%20Video%20Background%20Loop%20HD%201080p.ogv" + ) + local_path = os.path.join( + "tests", "testcontent", "downloaded", "low_res_ogv_video.ogv" + ) download_fixture_file(source_url, local_path) assert os.path.exists(local_path) - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute + @pytest.fixture def 
bad_video(): local_path = os.path.join("tests", "testcontent", "generated", "bad_video.mp4") if not os.path.exists(local_path): - with open(local_path, 'wb') as f: - f.write(b'novideohere. so ffmpeg should error out!') + with open(local_path, "wb") as f: + f.write(b"novideohere. so ffmpeg should error out!") f.flush() else: - f = open(local_path, 'rb') + f = open(local_path, "rb") f.close() return f # returns a closed file descriptor which we use for name attribute -def make_video_file(video_file_file, language='en'): +def make_video_file(video_file_file, language="en"): """ Creates a VideoFile object with path taken from `video_file_file.name`. """ @@ -95,44 +118,60 @@ def make_video_file(video_file_file, language='en'): class Test_video_processing_and_presets(object): - def setup_method(self): _clear_ricecookerfilecache() def test_basic_video_processing_low_res(self, low_res_video): - expected_video_filename = '897d83a2e5389d454d37feb574587516.mp4' + expected_video_filename = "897d83a2e5389d454d37feb574587516.mp4" video_file = make_video_file(low_res_video) video_filename = video_file.process_file() - assert video_filename == expected_video_filename, "Video file should have filename {}".format(expected_video_filename) + assert ( + video_filename == expected_video_filename + ), "Video file should have filename {}".format(expected_video_filename) video_path = config.get_storage_path(video_filename) - assert os.path.isfile(video_path), "Video should be stored at {}".format(video_path) - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' + assert os.path.isfile(video_path), "Video should be stored at {}".format( + video_path + ) + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_basic_video_processing_low_res_webm(self, low_res_video_webm): - expected_video_filename = '5a2172860b2de19d746d00e3deeae3a7.webm' + expected_video_filename = "5a2172860b2de19d746d00e3deeae3a7.webm" video_file = make_video_file(low_res_video_webm) video_filename = video_file.process_file() - assert video_filename == expected_video_filename, "Video file should have filename {}".format(expected_video_filename) + assert ( + video_filename == expected_video_filename + ), "Video file should have filename {}".format(expected_video_filename) video_path = config.get_storage_path(video_filename) - assert os.path.isfile(video_path), "Video should be stored at {}".format(video_path) - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' + assert os.path.isfile(video_path), "Video should be stored at {}".format( + video_path + ) + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_basic_video_processing_high_res(self, high_res_video): - expected_video_filename = 'e0ca22680786379362d0c95db2318853.mp4' + expected_video_filename = "e0ca22680786379362d0c95db2318853.mp4" video_file = make_video_file(high_res_video) video_filename = video_file.process_file() - assert video_filename == expected_video_filename, "Video file should have filename {}".format(expected_video_filename) - assert video_file.get_preset() == format_presets.VIDEO_HIGH_RES, 'Should have high res preset' + assert ( + video_filename == expected_video_filename + ), "Video file should have filename {}".format(expected_video_filename) + assert ( + video_file.get_preset() == format_presets.VIDEO_HIGH_RES + ), "Should have high res preset" def 
test_basic_video_processing_high_res_webm(self, high_res_video_webm): - expected_video_filename = '06b4e0d8c50f2224868086ad2fb92511.webm' + expected_video_filename = "06b4e0d8c50f2224868086ad2fb92511.webm" video_file = make_video_file(high_res_video_webm) video_filename = video_file.process_file() - assert video_filename == expected_video_filename, "Video file should have filename {}".format(expected_video_filename) - assert video_file.get_preset() == format_presets.VIDEO_HIGH_RES, 'Should have high res preset' - - - + assert ( + video_filename == expected_video_filename + ), "Video file should have filename {}".format(expected_video_filename) + assert ( + video_file.get_preset() == format_presets.VIDEO_HIGH_RES + ), "Should have high res preset" """ *********** TEST VIDEO COMPRESSION *********** """ @@ -140,90 +179,111 @@ def test_basic_video_processing_high_res_webm(self, high_res_video_webm): def get_resolution(videopath): """Helper function to get resolution of video at videopath.""" - result = subprocess.check_output(['ffprobe', '-v', 'error', '-print_format', 'json', '-show_entries', - 'stream=width,height', '-of', 'default=noprint_wrappers=1', str(videopath)]) - pattern = re.compile('width=([0-9]*)[^height]+height=([0-9]*)') + result = subprocess.check_output( + [ + "ffprobe", + "-v", + "error", + "-print_format", + "json", + "-show_entries", + "stream=width,height", + "-of", + "default=noprint_wrappers=1", + str(videopath), + ] + ) + pattern = re.compile("width=([0-9]*)[^height]+height=([0-9]*)") m = pattern.search(str(result)) width, height = int(m.group(1)), int(m.group(2)) return width, height -class Test_video_compression(object): +class Test_video_compression(object): def setup_method(self): _clear_ricecookerfilecache() def test_default_compression_works(self, high_res_video): video_file = make_video_file(high_res_video) - video_file.ffmpeg_settings = {'crf': 33} + video_file.ffmpeg_settings = {"crf": 33} video_filename = video_file.process_file() video_path = config.get_storage_path(video_filename) width, height = get_resolution(video_path) - assert height == 480, 'should compress to 480 v resolution by defualt' - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' - + assert height == 480, "should compress to 480 v resolution by default" + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_compression_works(self, high_res_video): video_file = make_video_file(high_res_video) - video_file.ffmpeg_settings = {'crf': 33, 'max_height': 300} + video_file.ffmpeg_settings = {"crf": 33, "max_height": 300} video_filename = video_file.process_file() video_path = config.get_storage_path(video_filename) width, height = get_resolution(video_path) - assert height == 300, 'should be compress to 300 v resolution' - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' - + assert height == 300, "should be compressed to 300 v resolution" + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_compression_max_width_works(self, high_res_video): video_file = make_video_file(high_res_video) - video_file.ffmpeg_settings = {'crf': 33, 'max_width': 200} + video_file.ffmpeg_settings = {"crf": 33, "max_width": 200} video_filename = video_file.process_file() video_path = config.get_storage_path(video_filename) width, height = get_resolution(video_path) - assert width == 200, 'should be compress to 200 hz 
resolution' - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' - + assert width == 200, "should be compressed to 200 h resolution" + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_handles_bad_file(self, bad_video): video_file = make_video_file(bad_video) - video_file.ffmpeg_settings = {'crf': 33} + video_file.ffmpeg_settings = {"crf": 33} video_filename = video_file.process_file() - assert video_filename == None, 'Should return None if trying to compress bad file' - assert "Invalid data" in str(video_file.error), 'File object should have error details' - assert video_file in config.FAILED_FILES, 'Video file sould be added to config.FAILED_FILES' - + assert ( + video_filename == None + ), "Should return None if trying to compress bad file" + assert "Invalid data" in str( + video_file.error + ), "File object should have error details" + assert ( + video_file in config.FAILED_FILES + ), "Video file should be added to config.FAILED_FILES" +""" *********** TEST VIDEO CONVERSION *********** """ -""" *********** TEST VIDEO CONVERSION *********** """ class Test_video_conversion(object): - def setup_method(self): _clear_ricecookerfilecache() def test_convert_ogv_works(self, low_res_ogv_video): video_file = make_video_file(low_res_ogv_video) - video_file.ffmpeg_settings = {'crf': 33, 'max_height': 300} + video_file.ffmpeg_settings = {"crf": 33, "max_height": 300} video_filename = video_file.process_file() video_path = config.get_storage_path(video_filename) width, height = get_resolution(video_path) - assert height == 300, 'should be compress to 300 v resolution' + assert height == 300, "should be compressed to 300 v resolution" + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" def test_convert_and_resize_ogv_works(self, low_res_ogv_video): video_file = make_video_file(low_res_ogv_video) - video_file.ffmpeg_settings = {'crf': 33, 'max_height': 200} + video_file.ffmpeg_settings = {"crf": 33, "max_height": 200} video_filename = video_file.process_file() video_path = config.get_storage_path(video_filename) width, height = get_resolution(video_path) - assert height == 200, 'should be compress to 200 v resolution' - assert video_file.get_preset() == format_presets.VIDEO_LOW_RES, 'Should have low res preset' - - - + assert height == 200, "should be compressed to 200 v resolution" + assert ( + video_file.get_preset() == format_presets.VIDEO_LOW_RES + ), "Should have low res preset" """ HELPER METHODS """ + def _clear_ricecookerfilecache(): """ Clear `.ricecookerfilecache` dir contents so each test runs in a clean env. @@ -241,29 +301,29 @@ def _clear_ricecookerfilecache(): print(e) - """ *********** TEST SUBTITLES CONVERSION *********** """ # see section SUBTITLEFILE TESTS in test_files.py - """ *********** TEST VIDEO FILE SUBS VALIDATION *********** """ + def test_multiple_subs_can_be_added(video_file): """ Baseline check to make sure we're not dropping subtitle files on validate. 
""" local_path = os.path.join("tests", "testcontent", "samples", "testsubtitles_ar.srt") assert os.path.exists(local_path) - video_node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN) + video_node = VideoNode("vid-src-id", "Video", licenses.PUBLIC_DOMAIN) video_node.add_file(video_file) - sub1 = SubtitleFile(local_path, language='en') + sub1 = SubtitleFile(local_path, language="en") video_node.add_file(sub1) - sub2 = SubtitleFile(local_path, language='ar') + sub2 = SubtitleFile(local_path, language="ar") video_node.add_file(sub2) video_node.validate() sub_files = [f for f in video_node.files if isinstance(f, SubtitleFile)] - assert len(sub_files) == 2, 'Missing subtitles files!' + assert len(sub_files) == 2, "Missing subtitles files!" + def test_duplicate_language_codes_fixed_by_validate(video_file): """ @@ -271,13 +331,13 @@ def test_duplicate_language_codes_fixed_by_validate(video_file): """ local_path = os.path.join("tests", "testcontent", "samples", "testsubtitles_ar.srt") assert os.path.exists(local_path) - video_node = VideoNode('vid-src-id', "Video", licenses.PUBLIC_DOMAIN) + video_node = VideoNode("vid-src-id", "Video", licenses.PUBLIC_DOMAIN) video_node.add_file(video_file) - sub1 = SubtitleFile(local_path, language='ar') + sub1 = SubtitleFile(local_path, language="ar") video_node.add_file(sub1) # now let's add file with a duplicate language code... - sub2 = SubtitleFile(local_path, language='ar') + sub2 = SubtitleFile(local_path, language="ar") video_node.add_file(sub2) video_node.validate() sub_files = [f for f in video_node.files if isinstance(f, SubtitleFile)] - assert len(sub_files) == 1, 'Duplicate subtitles files not removed!' + assert len(sub_files) == 1, "Duplicate subtitles files not removed!" diff --git a/tests/test_youtube.py b/tests/test_youtube.py index 9fe82354..4d84a475 100644 --- a/tests/test_youtube.py +++ b/tests/test_youtube.py @@ -1,33 +1,48 @@ import os + import pytest -from ricecooker.utils.youtube import YouTubeVideoUtils, YouTubePlaylistUtils +from ricecooker.utils.youtube import YouTubePlaylistUtils +from ricecooker.utils.youtube import YouTubeVideoUtils """ *********** YouTube Cache FIXTURES *********** """ + @pytest.fixture def youtube_video_cache(): - cache_dir = os.path.join('tests', 'testcontent', 'youtubecache') - assert os.path.isdir(cache_dir), 'Incorrect directory path setting' - return YouTubeVideoUtils(id='zzJLYK893gQ', alias='test-video', cache_dir=cache_dir) + cache_dir = os.path.join("tests", "testcontent", "youtubecache") + assert os.path.isdir(cache_dir), "Incorrect directory path setting" + return YouTubeVideoUtils(id="zzJLYK893gQ", alias="test-video", cache_dir=cache_dir) + @pytest.fixture def youtube_playlist_cache(): - cache_dir = os.path.join('tests', 'testcontent', 'youtubecache') - assert os.path.isdir(cache_dir), 'Incorrect directory path setting' - return YouTubePlaylistUtils(id='PLOZioxrIwCv33zt5aFFjWqDoEMm55MVA9', alias='test-playlist', cache_dir=cache_dir) + cache_dir = os.path.join("tests", "testcontent", "youtubecache") + assert os.path.isdir(cache_dir), "Incorrect directory path setting" + return YouTubePlaylistUtils( + id="PLOZioxrIwCv33zt5aFFjWqDoEMm55MVA9", + alias="test-playlist", + cache_dir=cache_dir, + ) """ *********** YouTube Cache TESTS *********** """ + def test_youtube_video_cache(youtube_video_cache): - video_info = youtube_video_cache.get_video_info(use_proxy=False, get_subtitle_languages=True) - video_cache_filepath = os.path.join('tests', 'testcontent', 'youtubecache', 'test-video.json') + 
video_info = youtube_video_cache.get_video_info( + use_proxy=False, get_subtitle_languages=True + ) + video_cache_filepath = os.path.join( + "tests", "testcontent", "youtubecache", "test-video.json" + ) assert video_info and os.path.exists(video_cache_filepath) + def test_youtube_playlist_cache(youtube_playlist_cache): playlist_info = youtube_playlist_cache.get_playlist_info(use_proxy=False) - playlist_cache_filepath = os.path.join('tests', 'testcontent', 'youtubecache', 'test-playlist.json') + playlist_cache_filepath = os.path.join( + "tests", "testcontent", "youtubecache", "test-playlist.json" + ) assert playlist_info and os.path.exists(playlist_cache_filepath) - \ No newline at end of file diff --git a/tests/testchannels/csv_channel_with_exercises/Channel.csv b/tests/testchannels/csv_channel_with_exercises/Channel.csv index 601e7f43..25eae4de 100644 --- a/tests/testchannels/csv_channel_with_exercises/Channel.csv +++ b/tests/testchannels/csv_channel_with_exercises/Channel.csv @@ -1,2 +1,2 @@ Title,Description,Domain,Source ID,Language,Thumbnail -Test CSV channel with Exercises,This channel was created from the files in the channeldir/ directory and the metadata stored in CSV files,source.org,csv_channel_with_exercises,en,channeldir/channel_thumbnail.jpg \ No newline at end of file +Test CSV channel with Exercises,This channel was created from the files in the channeldir/ directory and the metadata stored in CSV files,source.org,csv_channel_with_exercises,en,channeldir/channel_thumbnail.jpg diff --git a/tests/testchannels/csv_channel_with_exercises/Content.csv b/tests/testchannels/csv_channel_with_exercises/Content.csv index a1063bcf..58a7941d 100644 --- a/tests/testchannels/csv_channel_with_exercises/Content.csv +++ b/tests/testchannels/csv_channel_with_exercises/Content.csv @@ -1,4 +1,4 @@ Path *,Title *,Source ID,Description,Author,Language,License ID *,License Description,Copyright Holder,Thumbnail channeldir/contentnodes,Content Nodes,3be352f9,Put folder description here,,en,,,, channeldir/contentnodes/audio,Audio Files,09219f2e,Put folder description here,,en,,,, -channeldir/exercises,Exercises,fafafa007,"This doesn’t contain any files, but will be populated with some of the exercises from Exercises.csv",First Last (author's name),en,CC BY,,Copyright holder name, \ No newline at end of file +channeldir/exercises,Exercises,fafafa007,"This doesn’t contain any files, but will be populated with some of the exercises from Exercises.csv",First Last (author's name),en,CC BY,,Copyright holder name, diff --git a/tests/testchannels/csv_channel_with_exercises/ExerciseQuestions.csv b/tests/testchannels/csv_channel_with_exercises/ExerciseQuestions.csv index 1d7c35c1..4cb19a3c 100644 --- a/tests/testchannels/csv_channel_with_exercises/ExerciseQuestions.csv +++ b/tests/testchannels/csv_channel_with_exercises/ExerciseQuestions.csv @@ -9,4 +9,4 @@ exrc3,1,single_selection,What is your 2+2?,1,2,3,4,5,,4,,,Add the two numbers to exrc4,2,single_selection,"What is the area of the circle shown below ![](figures/exrc4/circle-of-radius-2.png)",$\pi$,$2\pi$,$3\pi$,$4\pi$,$5\pi$,,$4\pi$,,,The area of a circle is proportional to the square of its radius.,The formula is $A=\pi r^2$.,In this case the circle has radius $r=2$ so the area of the circle is $A=4\pi$.,,, -exrc5,3,multiple_selection,Select all the 
triangles.,![](figures/exrc5/triangle1.png),![](figures/exrc5/hexagon.png),![](figures/exrc5/triangle2.png),![](figures/exrc5/triangle3.png),![](figures/exrc5/octagon.png),![](figures/exrc5/square.png),![](figures/exrc5/triangle1.png),![](figures/exrc5/triangle2.png),![](figures/exrc5/triangle3.png),A triangle is a geometrical shape with three sides and three vertices.,,,,, \ No newline at end of file +exrc5,3,multiple_selection,Select all the triangles.,![](figures/exrc5/triangle1.png),![](figures/exrc5/hexagon.png),![](figures/exrc5/triangle2.png),![](figures/exrc5/triangle3.png),![](figures/exrc5/octagon.png),![](figures/exrc5/square.png),![](figures/exrc5/triangle1.png),![](figures/exrc5/triangle2.png),![](figures/exrc5/triangle3.png),A triangle is a geometrical shape with three sides and three vertices.,,,,, diff --git a/tests/testchannels/csv_channel_with_exercises/Exercises.csv b/tests/testchannels/csv_channel_with_exercises/Exercises.csv index 903444d6..49fbcd6f 100644 --- a/tests/testchannels/csv_channel_with_exercises/Exercises.csv +++ b/tests/testchannels/csv_channel_with_exercises/Exercises.csv @@ -3,4 +3,4 @@ channeldir/exercise1,First Exercise,exrc1,This is a really math exercise that wi channeldir/contentnodes/audio/Wzexercise,Second Exercise,exrc2,An exrcise ,Ivan Savov,fr,CC BY,,Learning Equality,,,FALSE,channeldir/contentnodes/audio/WZ_exercise_thumbnail.png channeldir/exercises/exercise3,Third Exercise,exrc3,An exercise in the subfolder exercsies/,Ivan Savov,en,CC BY,,Learning Equality,,,FALSE, channeldir/exercises/exercise4,Fourth Exercise,exrc4,An exercise that shows figures in question and use of LaTeX markup,Ivan Savov,en,CC BY,,Learning Equality,,,FALSE, -channeldir/exercises/exercise5,Fifth Exercise,exrc5,An exercise which shows figures as answers,Ivan Savov,en,CC BY,,Learning Equality,,,FALSE, \ No newline at end of file +channeldir/exercises/exercise5,Fifth Exercise,exrc5,An exercise which shows figures as answers,Ivan Savov,en,CC BY,,Learning Equality,,,FALSE, diff --git a/tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd.svg b/tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd.svg index 1ad091aa..a88a8152 100644 --- a/tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd.svg +++ b/tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd.svg @@ -1 +1 @@ - \ No newline at end of file + diff --git a/tests/testcontent/samples/testsubtitles_ar.srt b/tests/testcontent/samples/testsubtitles_ar.srt index a4dc4ff5..36c9dcdc 100644 --- a/tests/testcontent/samples/testsubtitles_ar.srt +++ b/tests/testcontent/samples/testsubtitles_ar.srt @@ -9,4 +9,3 @@ 3 00:00:18,536 --> 00:00:22,119 وأحاول مساعدة الناس على زيادة حظهم. -