",
}
- """)
- raise ValueError('Missing credentials')
+ """
+ )
+ raise ValueError("Missing credentials")
studio_creds = json.load(open(STUDIO_CREDENTIALS))
#
    # Studio API client (note: currently needs both session auth and a token)
api = StudioApi(
- token=studio_creds['token'],
- username=studio_creds['username'],
- password=studio_creds['password'],
- studio_url=studio_creds.get('studio_url', 'https://studio.learningequality.org')
+ token=studio_creds["token"],
+ username=studio_creds["username"],
+ password=studio_creds["password"],
+ studio_url=studio_creds.get(
+ "studio_url", "https://studio.learningequality.org"
+ ),
)
return api
def export_corrections_csv(args):
api = get_studio_api()
- channel_tree = get_channel_tree(api, args.channel_id, suffix='-export')
+ channel_tree = get_channel_tree(api, args.channel_id, suffix="-export")
print_channel_tree(channel_tree)
csvexporter = CorretionsCsvFileExporter()
csvexporter.export_channel_tree_as_corrections_csv(channel_tree)
@@ -590,69 +631,88 @@ def export_corrections_csv(args):
def apply_corrections(args):
# 1. LOAD Studio channel_tree (needed for lookups by node_id, content_id, etc.)
api = get_studio_api()
- channel_tree = get_channel_tree(api, args.channel_id, suffix='-before')
+ channel_tree = get_channel_tree(api, args.channel_id, suffix="-before")
#
# 2. IMPORT the corrections from the Spreadsheet
- csvfilepath = 'corrections-import.csv'
+ csvfilepath = "corrections-import.csv"
save_gsheet_to_local_csv(args.gsheet_id, args.gid, csvfilepath=csvfilepath)
#
# 3. TRANSFORM corrections-import.csv to Studio detailed diff format
- modifyattrs = args.modifyattrs.split(',') # using only selected attributes
+ modifyattrs = args.modifyattrs.split(",") # using only selected attributes
correctionspath = get_corrections_by_node_id(csvfilepath, modifyattrs)
#
# Special case: when export was performed on source channel, but we want to
    # apply the corrections to a cloned channel. In that case, the `Node ID`
    # column in the CSV corresponds to the `original_source_node_id` attribute
    # of the nodes in the derivative channel, so we must do a remapping:
- if args.primarykey == 'original_source_node_id':
+ if args.primarykey == "original_source_node_id":
corrections_by_original_source_node_id = json.load(open(correctionspath))
- corrections_by_node_id = remap_original_source_node_id_to_node_id(channel_tree, corrections_by_original_source_node_id)
- json.dump(corrections_by_node_id, open(correctionspath, 'w'), indent=4, ensure_ascii=False, sort_keys=True)
- print('Finished original_source_node_id-->node_id lookup and remapping.')
- elif args.primarykey in ['content_id', 'studio_id']:
- raise NotImplementedError('Using content_id and studio_id not ready yet.')
+ corrections_by_node_id = remap_original_source_node_id_to_node_id(
+ channel_tree, corrections_by_original_source_node_id
+ )
+ json.dump(
+ corrections_by_node_id,
+ open(correctionspath, "w"),
+ indent=4,
+ ensure_ascii=False,
+ sort_keys=True,
+ )
+ print("Finished original_source_node_id-->node_id lookup and remapping.")
+ elif args.primarykey in ["content_id", "studio_id"]:
+ raise NotImplementedError("Using content_id and studio_id not ready yet.")
#
# Early exit if running the `importonly` command
- if args.command == 'importonly':
- print('Corrections json file imported. See', correctionspath)
+ if args.command == "importonly":
+ print("Corrections json file imported. See", correctionspath)
return correctionspath
#
    # 4. LOAD corrections.json (four lists of corrections organized by node_id)
corrections_by_node_id = json.load(open(correctionspath))
#
# 5. Apply the corrections
- apply_corrections_by_node_id(api, channel_tree, args.channel_id, corrections_by_node_id)
+ apply_corrections_by_node_id(
+ api, channel_tree, args.channel_id, corrections_by_node_id
+ )
#
# 6. SAVE the Studio tree after corrections for review of what was changed
- channel_tree = get_channel_tree(api, args.channel_id, suffix='-after')
-
+ channel_tree = get_channel_tree(api, args.channel_id, suffix="-after")
def correctionsmain():
"""
Command line interface for applying bulk-edit corrections:
"""
- parser = argparse.ArgumentParser(description='Bulk channel edits via CSV/sheets.')
- parser.add_argument('command', help='One of export|importonly|apply',
- choices=['export', 'importonly', 'apply'])
- parser.add_argument('channel_id', help='The studio Channel ID to edit')
- parser.add_argument('--primarykey', help='Which idendifier to use when looking up nodes',
- choices=['node_id', 'content_id', 'original_source_node_id', 'studio_id'],
- default='node_id')
- parser.add_argument('--gsheet_id', help='Google spreadsheets sheet ID (public)')
- parser.add_argument('--gid', help='The gid argument to indicate which sheet', default='0')
- parser.add_argument('--modifyattrs', help='Which attributes to modify',
- default='title,description,author,copyright_holder')
+ parser = argparse.ArgumentParser(description="Bulk channel edits via CSV/sheets.")
+ parser.add_argument(
+ "command",
+ help="One of export|importonly|apply",
+ choices=["export", "importonly", "apply"],
+ )
+ parser.add_argument("channel_id", help="The studio Channel ID to edit")
+ parser.add_argument(
+ "--primarykey",
+ help="Which idendifier to use when looking up nodes",
+ choices=["node_id", "content_id", "original_source_node_id", "studio_id"],
+ default="node_id",
+ )
+ parser.add_argument("--gsheet_id", help="Google spreadsheets sheet ID (public)")
+ parser.add_argument(
+ "--gid", help="The gid argument to indicate which sheet", default="0"
+ )
+ parser.add_argument(
+ "--modifyattrs",
+ help="Which attributes to modify",
+ default="title,description,author,copyright_holder",
+ )
args = parser.parse_args()
# print("in corrections.main with cliargs", args)
- if args.command == 'export':
+ if args.command == "export":
export_corrections_csv(args)
- elif args.command in ['importonly', 'apply']:
+ elif args.command in ["importonly", "apply"]:
apply_corrections(args)
else:
- raise ValueError('Unrecognized command')
+ raise ValueError("Unrecognized command")
-if __name__ == '__main__':
+if __name__ == "__main__":
correctionsmain()
-
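For readers following the `--primarykey original_source_node_id` branch above: the remapping done by `remap_original_source_node_id_to_node_id` boils down to walking the derivative channel tree, indexing nodes by `original_source_node_id`, and re-keying the corrections dict. A minimal sketch, under the assumption that tree nodes are plain dicts with `node_id`, `original_source_node_id`, and `children` keys (the actual helper in the codebase may differ):

    # Illustrative sketch only, not the library implementation.
    def _build_original_source_index(channel_tree):
        """Map original_source_node_id -> node_id by walking the tree."""
        index = {}
        stack = [channel_tree]
        while stack:
            node = stack.pop()
            if node.get("original_source_node_id"):
                index[node["original_source_node_id"]] = node["node_id"]
            stack.extend(node.get("children", []))
        return index

    def _remap_corrections(channel_tree, corrections_by_original_source_node_id):
        index = _build_original_source_index(channel_tree)
        return {
            index[key]: value
            for key, value in corrections_by_original_source_node_id.items()
            if key in index
        }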
diff --git a/ricecooker/utils/downloader.py b/ricecooker/utils/downloader.py
index faa7e5f1..668681eb 100644
--- a/ricecooker/utils/downloader.py
+++ b/ricecooker/utils/downloader.py
@@ -4,42 +4,49 @@
import mimetypes
import os
import re
-import requests
+import selenium.webdriver.support.ui as selenium_ui
import shutil
import tempfile
import time
-from urllib.parse import urlparse, urljoin
-from urllib.request import url2pathname
import uuid
+from selenium import webdriver
+from urllib.parse import urljoin
+from urllib.parse import urlparse
+from urllib.request import url2pathname
import chardet
-
+import requests
from bs4 import BeautifulSoup
-from selenium import webdriver
-import selenium.webdriver.support.ui as selenium_ui
from requests_file import FileAdapter
-from ricecooker.config import LOGGER, PHANTOMJS_PATH, STRICT
-from ricecooker.utils.html import download_file, replace_links
-from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter
+
+from ricecooker.config import LOGGER
+from ricecooker.config import PHANTOMJS_PATH
+from ricecooker.config import STRICT
+from ricecooker.utils.caching import CacheControlAdapter
+from ricecooker.utils.caching import CacheForeverHeuristic
+from ricecooker.utils.caching import FileCache
+from ricecooker.utils.caching import InvalidatingCacheControlAdapter
+from ricecooker.utils.html import download_file
+from ricecooker.utils.html import replace_links
from ricecooker.utils.zip import create_predictable_zip
-DOWNLOAD_SESSION = requests.Session() # Session for downloading content from urls
-DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3))
-DOWNLOAD_SESSION.mount('file://', FileAdapter())
+DOWNLOAD_SESSION = requests.Session() # Session for downloading content from urls
+DOWNLOAD_SESSION.mount("https://", requests.adapters.HTTPAdapter(max_retries=3))
+DOWNLOAD_SESSION.mount("file://", FileAdapter())
# use_dir_lock works with all filesystems and OSes
-cache = FileCache('.webcache', use_dir_lock=True)
-forever_adapter= CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
+cache = FileCache(".webcache", use_dir_lock=True)
+forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)
# we can't use requests caching for pyppeteer / phantomjs, so track those separately.
downloaded_pages = {}
-DOWNLOAD_SESSION.mount('http://', forever_adapter)
-DOWNLOAD_SESSION.mount('https://', forever_adapter)
+DOWNLOAD_SESSION.mount("http://", forever_adapter)
+DOWNLOAD_SESSION.mount("https://", forever_adapter)
DEFAULT_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0",
"Accept-Encoding": "gzip, deflate",
- "Connection": "keep-alive"
+ "Connection": "keep-alive",
}
@@ -55,20 +62,34 @@
from pyppeteer import launch, errors
async def load_page(path, timeout=30, strict=True):
- browser = await launch({'headless': True})
+ browser = await launch({"headless": True})
content = None
cookies = None
page = None
try:
page = await browser.newPage()
try:
- await page.goto(path, {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle0']})
+ await page.goto(
+ path,
+ {
+ "timeout": timeout * 1000,
+ "waitUntil": ["load", "domcontentloaded", "networkidle0"],
+ },
+ )
except errors.TimeoutError:
            # some sites have API calls running regularly, so the timeout may occur because there's never any true
# network idle time. Try 'networkidle2' option instead before determining we can't scrape.
if not strict:
- LOGGER.info("Attempting to download URL with networkidle2 instead of networkidle0...")
- await page.goto(path, {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle2']})
+ LOGGER.info(
+ "Attempting to download URL with networkidle2 instead of networkidle0..."
+ )
+ await page.goto(
+ path,
+ {
+ "timeout": timeout * 1000,
+ "waitUntil": ["load", "domcontentloaded", "networkidle2"],
+ },
+ )
else:
raise
# get the entire rendered page, including the doctype
@@ -78,36 +99,53 @@ async def load_page(path, timeout=30, strict=True):
LOGGER.warning("Error scraping page: {}".format(e))
finally:
await browser.close()
- return content, {'cookies': cookies, 'url': path}
+ return content, {"cookies": cookies, "url": path}
async def take_screenshot(url, filename, element=None, timeout=30):
- browser = await launch({'headless': True})
+ browser = await launch({"headless": True})
try:
page = await browser.newPage()
- await page.goto(url,
- {'timeout': timeout * 1000, 'waitUntil': ['load', 'domcontentloaded', 'networkidle0']})
+ await page.goto(
+ url,
+ {
+ "timeout": timeout * 1000,
+ "waitUntil": ["load", "domcontentloaded", "networkidle0"],
+ },
+ )
screenshot_element = page
if element:
- await page.waitForSelector(element, {'timeout': 10000})
+ await page.waitForSelector(element, {"timeout": 10000})
elements = await page.querySelectorAll(element)
if len(list(elements)) > 1:
- LOGGER.warning("Multiple elements matched screenshot element, using first...")
+ LOGGER.warning(
+ "Multiple elements matched screenshot element, using first..."
+ )
screenshot_element = elements[0]
LOGGER.info("Saving screenshot to {}".format(filename))
- await screenshot_element.screenshot({'path': filename})
+ await screenshot_element.screenshot({"path": filename})
finally:
await page.close()
await browser.close()
+
USE_PYPPETEER = True
except:
print("Unable to load pyppeteer, using phantomjs for JS loading.")
pass
-def read(path, loadjs=False, session=None, driver=None, timeout=60,
- clear_cookies=True, loadjs_wait_time=3, loadjs_wait_for_callback=None, strict=True):
+def read(
+ path,
+ loadjs=False,
+ session=None,
+ driver=None,
+ timeout=60,
+ clear_cookies=True,
+ loadjs_wait_time=3,
+ loadjs_wait_for_callback=None,
+ strict=True,
+):
"""Reads from source and returns contents
Args:
@@ -133,7 +171,7 @@ def read(path, loadjs=False, session=None, driver=None, timeout=60,
session = session or DOWNLOAD_SESSION
try:
- if loadjs: # Wait until js loads then return contents
+ if loadjs: # Wait until js loads then return contents
if USE_PYPPETEER:
content = asyncio.get_event_loop().run_until_complete(load_page(path))
return content
@@ -148,17 +186,21 @@ def read(path, loadjs=False, session=None, driver=None, timeout=60,
time.sleep(loadjs_wait_time)
return driver.page_source
- else: # Read page contents from url
+ else: # Read page contents from url
response = make_request(path, clear_cookies, session=session)
return response.content
except (requests.exceptions.MissingSchema, requests.exceptions.InvalidSchema):
- with open(path, 'rb') as fobj: # If path is a local file path, try to open the file
+ with open(
+ path, "rb"
+ ) as fobj: # If path is a local file path, try to open the file
return fobj.read()
-def make_request(url, clear_cookies=False, headers=None, timeout=60, session=None, *args, **kwargs):
+def make_request(
+ url, clear_cookies=False, headers=None, timeout=60, session=None, *args, **kwargs
+):
sess = session or DOWNLOAD_SESSION
if clear_cookies:
@@ -173,16 +215,33 @@ def make_request(url, clear_cookies=False, headers=None, timeout=60, session=Non
while retry_count <= max_retries:
try:
- response = sess.get(url, headers=request_headers, stream=True, timeout=timeout, *args, **kwargs)
+ response = sess.get(
+ url,
+ headers=request_headers,
+ stream=True,
+ timeout=timeout,
+ *args,
+ **kwargs
+ )
if response.status_code != 200:
- LOGGER.error("{} error while trying to download {}".format(response.status_code, url))
+ LOGGER.error(
+ "{} error while trying to download {}".format(
+ response.status_code, url
+ )
+ )
if STRICT:
response.raise_for_status()
return response
- except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout) as e:
+ except (
+ requests.exceptions.ConnectionError,
+ requests.exceptions.ReadTimeout,
+ ) as e:
retry_count += 1
- LOGGER.warning("Error with connection ('{msg}'); about to perform retry {count} of {trymax}."
- .format(msg=str(e), count=retry_count, trymax=max_retries))
+ LOGGER.warning(
+ "Error with connection ('{msg}'); about to perform retry {count} of {trymax}.".format(
+ msg=str(e), count=retry_count, trymax=max_retries
+ )
+ )
time.sleep(retry_count * 1)
if retry_count > max_retries:
LOGGER.error("Could not connect to: {}".format(url))
@@ -196,16 +255,26 @@ def make_request(url, clear_cookies=False, headers=None, timeout=60, session=Non
# TODO(davidhu): Use MD5 hash of URL (ideally file) instead.
def _derive_filename(url):
- name = os.path.basename(urlparse(url).path).replace('%', '_')
+ name = os.path.basename(urlparse(url).path).replace("%", "_")
return ("%s.%s" % (uuid.uuid4().hex, name)).lower()
# TODO: The number of args and inner functions in this strongly suggests this needs
# to be a class or have its functionality separated out.
-def download_static_assets(doc, destination, base_url,
- request_fn=make_request, url_blacklist=[], js_middleware=None,
- css_middleware=None, derive_filename=_derive_filename, link_policy=None,
- run_js=False, resource_urls=None, relative_links=False):
+def download_static_assets(
+ doc,
+ destination,
+ base_url,
+ request_fn=make_request,
+ url_blacklist=[],
+ js_middleware=None,
+ css_middleware=None,
+ derive_filename=_derive_filename,
+ link_policy=None,
+ run_js=False,
+ resource_urls=None,
+ relative_links=False,
+):
"""
Download all static assets referenced from an HTML page.
The goal is to easily create HTML5 apps! Downloads JS, CSS, images, and
@@ -231,8 +300,8 @@ def download_static_assets(doc, destination, base_url,
extract the raw HTML.)
"""
# without the ending /, some functions will treat the last path component like a filename, so add it.
- if not base_url.endswith('/'):
- base_url += '/'
+ if not base_url.endswith("/"):
+ base_url += "/"
LOGGER.debug("base_url = {}".format(base_url))
@@ -244,7 +313,7 @@ def download_srcset(selector, attr, content_middleware=None):
for i, node in enumerate(nodes):
srcset = node[attr]
- sources = srcset.split(',')
+ sources = srcset.split(",")
new_sources = []
for source in sources:
# a source can be just a URL, or a URL + a space character and then a width or resolution.
@@ -254,22 +323,30 @@ def download_srcset(selector, attr, content_middleware=None):
new_url = filename
if relative_links and base_url:
base_filename = derive_filename(base_url)
- new_url = get_relative_url_for_archive_filename(filename, base_filename)
+ new_url = get_relative_url_for_archive_filename(
+ filename, base_filename
+ )
fullpath = os.path.join(destination, filename)
if not os.path.exists(fullpath):
LOGGER.info("Downloading {} to filename {}".format(url, fullpath))
- download_file(url, destination, request_fn=request_fn,
- filename=filename, middleware_callbacks=content_middleware)
+ download_file(
+ url,
+ destination,
+ request_fn=request_fn,
+ filename=filename,
+ middleware_callbacks=content_middleware,
+ )
if len(parts) > 1:
- new_sources.append(" ".join([new_url, parts[1]]))
+ new_sources.append(" ".join([new_url, parts[1]]))
else:
new_sources.append(new_url)
- node[attr] = ', '.join(new_sources)
+ node[attr] = ", ".join(new_sources)
# Helper function to download all assets for a given CSS selector.
- def download_assets(selector, attr, url_middleware=None,
- content_middleware=None, node_filter=None):
+ def download_assets(
+ selector, attr, url_middleware=None, content_middleware=None, node_filter=None
+ ):
nodes = doc.select(selector)
for i, node in enumerate(nodes):
@@ -277,17 +354,17 @@ def download_assets(selector, attr, url_middleware=None,
if node_filter:
if not node_filter(node):
src = node[attr]
- node[attr] = ''
- print(' Skipping node with src ', src)
+ node[attr] = ""
+ print(" Skipping node with src ", src)
continue
- if node[attr].startswith('data:'):
+ if node[attr].startswith("data:"):
continue
url = urljoin(base_url, node[attr])
if _is_blacklisted(url, url_blacklist):
- LOGGER.info(' Skipping downloading blacklisted url', url)
+ LOGGER.info(" Skipping downloading blacklisted url", url)
node[attr] = ""
continue
@@ -300,13 +377,13 @@ def download_assets(selector, attr, url_middleware=None,
# This COULD be an index file in a dir, or just a file with no extension. Handle either case by
# turning the path into filename + '/index' + the file extension from the content type
response = requests.get(url)
- type = response.headers['content-type'].split(';')[0]
+ type = response.headers["content-type"].split(";")[0]
ext = mimetypes.guess_extension(type)
# if we're really stuck, just default to HTML as that is most likely if this is a redirect.
if not ext:
- ext = '.html'
+ ext = ".html"
subpath = os.path.dirname(filename)
- filename = 'index{}'.format(ext)
+ filename = "index{}".format(ext)
os.makedirs(os.path.join(destination, subpath), exist_ok=True)
@@ -319,21 +396,25 @@ def download_assets(selector, attr, url_middleware=None,
fullpath = os.path.join(destination, filename)
if not os.path.exists(fullpath):
LOGGER.info("Downloading {} to filename {}".format(url, fullpath))
- download_file(url, destination, request_fn=request_fn,
- filename=filename, middleware_callbacks=content_middleware)
+ download_file(
+ url,
+ destination,
+ request_fn=request_fn,
+ filename=filename,
+ middleware_callbacks=content_middleware,
+ )
elif content_middleware:
# Make sure we run middleware, as it creates a list of file dependencies that we need when
# converting the content into a zip file.
# TODO: We should probably separate out the download step from the middleware step, so
# that middleware can be run regardless of how we get the content.
- content = open(fullpath, 'r', encoding='utf-8').read()
+ content = open(fullpath, "r", encoding="utf-8").read()
new_content = content_middleware(content, url)
if new_content != content:
# if the middleware changed the content, update it.
- with open(fullpath, 'w') as f:
+ with open(fullpath, "w") as f:
f.write(new_content)
-
def js_content_middleware(content, url, **kwargs):
if js_middleware:
content = js_middleware(content, url, **kwargs)
@@ -354,35 +435,39 @@ def css_content_middleware(content, url, **kwargs):
def repl(match):
src = match.group(1)
- if src.startswith('//localhost'):
- return 'url()'
+ if src.startswith("//localhost"):
+ return "url()"
# Don't download data: files
- if src.startswith('data:'):
+ if src.startswith("data:"):
return match.group(0)
parts = urlparse(src)
root_url = None
if url:
- root_url = url[:url.rfind('/') + 1]
+ root_url = url[: url.rfind("/") + 1]
if parts.scheme and parts.netloc:
src_url = src
- elif parts.path.startswith('/') and url:
- src_url = '{}://{}{}'.format(root_parts.scheme, root_parts.netloc, parts.path)
+ elif parts.path.startswith("/") and url:
+ src_url = "{}://{}{}".format(
+ root_parts.scheme, root_parts.netloc, parts.path
+ )
elif url and root_url:
src_url = urljoin(root_url, src)
else:
src_url = urljoin(base_url, src)
if _is_blacklisted(src_url, url_blacklist):
- print(' Skipping downloading blacklisted url', src_url)
- return 'url()'
+ print(" Skipping downloading blacklisted url", src_url)
+ return "url()"
derived_filename = derive_filename(src_url)
new_url = src
- if url and parts.path.startswith('/') or relative_links:
+ if url and parts.path.startswith("/") or relative_links:
page_filename = derive_filename(url)
- new_url = get_relative_url_for_archive_filename(derived_filename, page_filename)
+ new_url = get_relative_url_for_archive_filename(
+ derived_filename, page_filename
+ )
elif derive_filename == _derive_filename:
            # The _derive_filename function puts all files in the root, so all URLs need to be
            # rewritten. When using get_archive_filename, relative URLs will still work.
@@ -390,10 +475,16 @@ def repl(match):
fullpath = os.path.join(destination, derived_filename)
if not os.path.exists(fullpath):
- download_file(src_url, destination, request_fn=request_fn,
- filename=derived_filename)
+ download_file(
+ src_url,
+ destination,
+ request_fn=request_fn,
+ filename=derived_filename,
+ )
else:
- LOGGER.debug("Resource already downloaded, skipping: {}".format(src_url))
+ LOGGER.debug(
+ "Resource already downloaded, skipping: {}".format(src_url)
+ )
return 'url("%s")' % new_url
return _CSS_URL_RE.sub(repl, content)
@@ -401,45 +492,51 @@ def repl(match):
# Download all linked static assets.
download_assets("img[src]", "src") # Images
download_srcset("img[srcset]", "srcset") # Images
- download_assets("link[href]", "href",
- content_middleware=css_content_middleware,
- node_filter=css_node_filter) # CSS
- download_assets("script[src]", "src",
- content_middleware=js_content_middleware) # JS
- download_assets("source[src]", "src") # Potentially audio
- download_srcset("source[srcset]", "srcset") # Potentially audio
+ download_assets(
+ "link[href]",
+ "href",
+ content_middleware=css_content_middleware,
+ node_filter=css_node_filter,
+ ) # CSS
+ download_assets(
+ "script[src]", "src", content_middleware=js_content_middleware
+ ) # JS
+ download_assets("source[src]", "src") # Potentially audio
+ download_srcset("source[srcset]", "srcset") # Potentially audio
# Link scraping can be expensive, so it's off by default. We decrement the levels value every time we recurse
# so skip once we hit zero.
- if link_policy is not None and link_policy['levels'] > 0:
+ if link_policy is not None and link_policy["levels"] > 0:
nodes = doc.select("iframe[src]")
nodes += doc.select("a[href]")
# TODO: add "a[href]" handling to this and/or ways to whitelist / blacklist tags and urls
for node in nodes:
url = None
- if node.name == 'iframe':
- url = node['src']
- elif node.name == 'a':
- url = node['href']
+ if node.name == "iframe":
+ url = node["src"]
+ elif node.name == "a":
+ url = node["href"]
assert url is not None
- download_url = url.split('#')[0] # Ignore bookmarks in URL
+ download_url = url.split("#")[0] # Ignore bookmarks in URL
if download_url.strip() == "":
continue
parts = urlparse(download_url)
# if we're scraping links, always scrape relative links regardless of setting.
- should_scrape = 'all' in link_policy['scope'] or (not parts.scheme and not parts.netloc)
- if not parts.scheme or parts.scheme.startswith('http'):
+ should_scrape = "all" in link_policy["scope"] or (
+ not parts.scheme and not parts.netloc
+ )
+ if not parts.scheme or parts.scheme.startswith("http"):
LOGGER.debug("checking url: {}".format(url))
if not parts.netloc:
download_url = urljoin(base_url, download_url)
- if 'whitelist' in link_policy:
- for whitelist_item in link_policy['whitelist']:
+ if "whitelist" in link_policy:
+ for whitelist_item in link_policy["whitelist"]:
if whitelist_item in download_url:
should_scrape = True
break
- if 'blacklist' in link_policy:
- for blacklist_item in link_policy['blacklist']:
+ if "blacklist" in link_policy:
+ for blacklist_item in link_policy["blacklist"]:
if blacklist_item in download_url:
should_scrape = False
break
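The `link_policy` checks above all read from a plain dict. A hypothetical example, with the keys taken from the code (`levels`, `scope`, `whitelist`, `blacklist`) and placeholder values:

    # Hypothetical policy: recurse one level deep, follow absolute links too,
    # force-scrape anything under example.org, and never scrape login pages.
    link_policy = {
        "levels": 1,                   # decremented on each recursion; link scraping stops at 0
        "scope": ["all"],              # "all" also follows absolute links; relative links are always followed
        "whitelist": ["example.org"],  # substrings that force a URL to be scraped
        "blacklist": ["/login"],       # substrings that prevent a URL from being scraped
    }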
@@ -447,22 +544,41 @@ def repl(match):
if should_scrape:
policy = copy.copy(link_policy)
# make sure we reduce the depth level by one each time we recurse
- policy['levels'] -= 1
+ policy["levels"] -= 1
# no extension is most likely going to return HTML as well.
- is_html = os.path.splitext(download_url)[1] in ['.htm', '.html', '.xhtml', '']
+ is_html = os.path.splitext(download_url)[1] in [
+ ".htm",
+ ".html",
+ ".xhtml",
+ "",
+ ]
derived_filename = derive_filename(download_url)
new_url = derived_filename
if is_html:
if not download_url in downloaded_pages:
- LOGGER.info("Downloading linked HTML page {}".format(download_url))
+ LOGGER.info(
+ "Downloading linked HTML page {}".format(download_url)
+ )
global archiver
if archiver:
- info = archiver.get_page(download_url, link_policy=policy, run_js=run_js)
- filename = info['index_path'].replace(archiver.root_dir + os.sep, '')
+ info = archiver.get_page(
+ download_url, link_policy=policy, run_js=run_js
+ )
+ filename = info["index_path"].replace(
+ archiver.root_dir + os.sep, ""
+ )
else:
- info = archive_page(download_url, destination, link_policy=policy, run_js=run_js, relative_links=relative_links)
- filename = info['index_path'].replace(destination + os.sep, '')
+ info = archive_page(
+ download_url,
+ destination,
+ link_policy=policy,
+ run_js=run_js,
+ relative_links=relative_links,
+ )
+ filename = info["index_path"].replace(
+ destination + os.sep, ""
+ )
new_url = filename
downloaded_pages[download_url] = new_url
@@ -475,7 +591,9 @@ def repl(match):
if relative_links and base_url:
page_filename = derive_filename(base_url)
- new_url = get_relative_url_for_archive_filename(new_url, page_filename)
+ new_url = get_relative_url_for_archive_filename(
+ new_url, page_filename
+ )
else:
full_path = os.path.join(destination, derived_filename)
new_url = derived_filename
@@ -485,19 +603,19 @@ def repl(match):
else:
LOGGER.info("File already downloaded, skipping: {}".format(url))
- if node.name == 'iframe':
- node['src'] = new_url
- elif node.name == 'a':
- node['href'] = new_url
+ if node.name == "iframe":
+ node["src"] = new_url
+ elif node.name == "a":
+ node["href"] = new_url
# ... and also run the middleware on CSS/JS embedded in the page source to
# get linked files.
- for node in doc.select('style'):
- node.string = css_content_middleware(node.get_text(), url='')
+ for node in doc.select("style"):
+ node.string = css_content_middleware(node.get_text(), url="")
- for node in doc.select('script'):
- if not node.attrs.get('src'):
- node.string = js_content_middleware(node.get_text(), url='')
+ for node in doc.select("script"):
+ if not node.attrs.get("src"):
+ node.string = js_content_middleware(node.get_text(), url="")
return doc
@@ -526,14 +644,18 @@ def get_archive_filename(url, page_url=None, download_root=None, resource_urls=N
if file_url_parsed.query:
# Append the query to the filename, so that the filename is unique for each set of params.
- query_string = "_{}".format(file_url_parsed.query.replace('=', '_').replace('&', '_'))
+ query_string = "_{}".format(
+ file_url_parsed.query.replace("=", "_").replace("&", "_")
+ )
local_path = _path + query_string + ext
LOGGER.debug("local_path is now {}".format(local_path))
local_dir_name = local_path
- if ext != '':
+ if ext != "":
local_dir_name = os.path.dirname(local_path)
- LOGGER.debug("local_path = {}, local_dir_name = {}".format(local_path, local_dir_name))
+ LOGGER.debug(
+ "local_path = {}, local_dir_name = {}".format(local_path, local_dir_name)
+ )
if local_dir_name != local_path and resource_urls is not None:
full_dir = os.path.join(download_root, local_dir_name)
@@ -543,19 +665,26 @@ def get_archive_filename(url, page_url=None, download_root=None, resource_urls=N
# Right now, this code depends on any file links having an extension, as in this function
# we don't know the mimetype of the resource yet. We should probably pass in mimetype to this
# function so we can construct filenames for extensionless URLs.
- if os.path.splitext(local_path)[1].strip() != '':
+ if os.path.splitext(local_path)[1].strip() != "":
LOGGER.debug("replacing {} with {}".format(url, local_path))
resource_urls[url] = local_path
return local_path
def get_relative_url_for_archive_filename(filename, relative_to):
- if os.path.isfile(relative_to) or os.path.splitext(relative_to)[1] != '':
+ if os.path.isfile(relative_to) or os.path.splitext(relative_to)[1] != "":
relative_to = os.path.dirname(relative_to)
return os.path.relpath(filename, relative_to).replace("\\", "/")
-def archive_page(url, download_root, link_policy=None, run_js=False, strict=False, relative_links=False):
+def archive_page(
+ url,
+ download_root,
+ link_policy=None,
+ run_js=False,
+ strict=False,
+ relative_links=False,
+):
"""
Download fully rendered page and all related assets into ricecooker's site archive format.
@@ -571,27 +700,35 @@ def archive_page(url, download_root, link_policy=None, run_js=False, strict=Fals
os.makedirs(download_root, exist_ok=True)
if run_js:
- content, props = asyncio.get_event_loop().run_until_complete(load_page(url, strict=strict))
+ content, props = asyncio.get_event_loop().run_until_complete(
+ load_page(url, strict=strict)
+ )
else:
response = make_request(url)
- props = {'cookies': requests.utils.dict_from_cookiejar(response.cookies), 'url': response.url}
- if not 'charset' in response.headers['Content-Type']:
+ props = {
+ "cookies": requests.utils.dict_from_cookiejar(response.cookies),
+ "url": response.url,
+ }
+ if not "charset" in response.headers["Content-Type"]:
# It seems requests defaults to ISO-8859-1 when the headers don't explicitly declare an
# encoding. In this case, we're better off using chardet to guess instead.
encoding = chardet.detect(response.content)
- if encoding and 'encoding' in encoding:
- response.encoding = encoding['encoding']
+ if encoding and "encoding" in encoding:
+ response.encoding = encoding["encoding"]
LOGGER.warning("Encoding = {}".format(response.encoding))
content = response.text
    # url may be redirected; for relative link handling we want the final URL that was loaded.
- url = props['url']
+ url = props["url"]
# get related assets
parts = urlparse(url)
if not parts.scheme:
- parts.scheme = 'https'
- base_url = urljoin("{}://{}".format(parts.scheme, parts.netloc), parts.path[:parts.path.rfind('/')])
+ parts.scheme = "https"
+ base_url = urljoin(
+ "{}://{}".format(parts.scheme, parts.netloc),
+ parts.path[: parts.path.rfind("/")],
+ )
resource_urls = {}
if content:
@@ -600,40 +737,55 @@ def archive_page(url, download_root, link_policy=None, run_js=False, strict=Fals
def get_resource_filename(url):
return get_archive_filename(url, page_url, download_root, resource_urls)
- doc = download_static_assets(content, download_root, base_url, derive_filename=get_resource_filename,
- link_policy=link_policy, run_js=run_js, resource_urls=resource_urls,
- relative_links=relative_links)
- download_path = os.path.join(download_root, get_archive_filename(url, page_url, download_root))
+ doc = download_static_assets(
+ content,
+ download_root,
+ base_url,
+ derive_filename=get_resource_filename,
+ link_policy=link_policy,
+ run_js=run_js,
+ resource_urls=resource_urls,
+ relative_links=relative_links,
+ )
+
+ download_path = os.path.join(
+ download_root, get_archive_filename(url, page_url, download_root)
+ )
_path, ext = os.path.splitext(download_path)
index_path = download_path
- if '.htm' not in ext:
- if page_url.endswith('/'):
- index_path = download_path + 'index.html'
+ if ".htm" not in ext:
+ if page_url.endswith("/"):
+ index_path = download_path + "index.html"
else:
- index_path = download_path + '.html'
+ index_path = download_path + ".html"
index_dir = os.path.dirname(index_path)
new_content = doc.prettify()
# Replace any links with relative links that we haven't changed already.
# TODO: Find a way to determine when this check is no longer needed.
- new_content = replace_links(new_content, resource_urls, download_root, index_dir, relative_links=relative_links)
+ new_content = replace_links(
+ new_content,
+ resource_urls,
+ download_root,
+ index_dir,
+ relative_links=relative_links,
+ )
os.makedirs(index_dir, exist_ok=True)
- soup = BeautifulSoup(new_content, features='lxml')
- f = open(index_path, 'wb')
+ soup = BeautifulSoup(new_content, features="lxml")
+ f = open(index_path, "wb")
f.write(soup.prettify(encoding="utf-8"))
f.close()
-
page_info = {
- 'url': url,
- 'cookies': props['cookies'],
- 'index_path': index_path,
- 'resources': list(resource_urls.values()),
- 'resource_urls': resource_urls
+ "url": url,
+ "cookies": props["cookies"],
+ "index_path": index_path,
+ "resources": list(resource_urls.values()),
+ "resource_urls": resource_urls,
}
LOGGER.info("archive_page finished...")
return page_info
@@ -683,7 +835,7 @@ def download_in_parallel(urls, func=None, max_workers=5):
class ArchiveDownloader:
def __init__(self, root_dir, relative_links=True):
self.root_dir = root_dir
- self.cache_file = os.path.join(self.root_dir, 'archive_files.json')
+ self.cache_file = os.path.join(self.root_dir, "archive_files.json")
self.cache_data = {}
# This is temporarily configurable for ArchiveDownloader-based chefs that
@@ -702,51 +854,64 @@ def __del__(self):
archiver = None
def save_cache_data(self):
- with open(self.cache_file, 'w') as f:
+ with open(self.cache_file, "w") as f:
f.write(json.dumps(self.cache_data, ensure_ascii=False, indent=2))
def clear_cache_data(self):
self.cache_data = {}
self.save_cache_data()
- def get_page(self, url, refresh=False, link_policy=None, run_js=False, strict=False):
+ def get_page(
+ self, url, refresh=False, link_policy=None, run_js=False, strict=False
+ ):
if refresh or not url in self.cache_data:
- self.cache_data[url] = archive_page(url, download_root=self.root_dir, link_policy=link_policy, run_js=run_js, strict=strict, relative_links=self.relative_links)
+ self.cache_data[url] = archive_page(
+ url,
+ download_root=self.root_dir,
+ link_policy=link_policy,
+ run_js=run_js,
+ strict=strict,
+ relative_links=self.relative_links,
+ )
self.save_cache_data()
return self.cache_data[url]
def get_relative_index_path(self, url):
- if url in self.cache_data and 'index_path' in self.cache_data[url]:
+ if url in self.cache_data and "index_path" in self.cache_data[url]:
if not self.relative_links:
# we copy the main page to index.html in the root of the page archive.
return "index.html"
- return self.cache_data[url]['index_path'].replace(self.root_dir + os.sep, '')
+ return self.cache_data[url]["index_path"].replace(
+ self.root_dir + os.sep, ""
+ )
return None
def find_page_by_index_path(self, index_path):
for url in self.cache_data:
- if self.cache_data[url]['index_path'] == index_path:
+ if self.cache_data[url]["index_path"] == index_path:
return self.cache_data[url]
return None
def get_page_soup(self, url):
if not url in self.cache_data:
- raise KeyError("Unable to find page {} in archive. Did you call get_page?".format(url))
+ raise KeyError(
+ "Unable to find page {} in archive. Did you call get_page?".format(url)
+ )
info = self.cache_data[url]
# lxml enables some nice features like being able to search for individual
# class names using BeautifulSoup, so let's just require it.
- soup = BeautifulSoup(open(info['index_path'], 'rb'), features='lxml')
+ soup = BeautifulSoup(open(info["index_path"], "rb"), features="lxml")
return soup
def create_dependency_zip(self, count_threshold=2):
resource_counts = {}
for url in self.cache_data:
info = self.cache_data[url]
- resources = info['resources']
+ resources = info["resources"]
for resource in resources.values():
if not resource in resource_counts:
resource_counts[resource] = 0
@@ -767,8 +932,8 @@ def _copy_resources_to_dir(self, base_dir, resources):
for res in resources:
res_path = res
if res_path.startswith(self.root_dir):
- res_path = res_path.replace(self.root_dir, '')
- if res_path.startswith('/'):
+ res_path = res_path.replace(self.root_dir, "")
+ if res_path.startswith("/"):
res_path = res_path[1:]
full_path = os.path.join(self.root_dir, res_path)
dest_path = os.path.join(base_dir, res_path)
@@ -778,21 +943,23 @@ def _copy_resources_to_dir(self, base_dir, resources):
def create_zip_dir_for_page(self, url):
if not url in self.cache_data:
- raise KeyError("Please ensure you call get_page before calling this function to download the content.")
+ raise KeyError(
+ "Please ensure you call get_page before calling this function to download the content."
+ )
temp_dir = tempfile.mkdtemp()
info = self.cache_data[url]
# TODO: Add dependency zip handling that replaces links with the dependency zip location
- self._copy_resources_to_dir(temp_dir, info['resources'])
- for res_url in info['resource_urls']:
+ self._copy_resources_to_dir(temp_dir, info["resources"])
+ for res_url in info["resource_urls"]:
if res_url in self.cache_data:
- resources = self.cache_data[res_url]['resources']
+ resources = self.cache_data[res_url]["resources"]
self._copy_resources_to_dir(temp_dir, resources)
index_path = self.get_relative_index_path(url)
os.makedirs(os.path.dirname(os.path.join(temp_dir, index_path)), exist_ok=True)
- shutil.copy(info['index_path'], os.path.join(temp_dir, index_path))
+ shutil.copy(info["index_path"], os.path.join(temp_dir, index_path))
return temp_dir
def export_page_as_zip(self, url):
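Before moving on to the next file, a usage sketch for the reformatted `ArchiveDownloader`, with placeholder directory and URL. `get_page` caches its result in `archive_files.json`, and `create_zip_dir_for_page` stages the page plus its downloaded resources into a temp directory:

    # Placeholder archive directory and URL.
    archive = ArchiveDownloader("page_archive")
    info = archive.get_page(
        "https://example.org/lesson/",
        link_policy={"levels": 1, "scope": ["all"]},
    )
    print(info["index_path"], len(info["resources"]))
    zip_dir = archive.create_zip_dir_for_page("https://example.org/lesson/")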
diff --git a/ricecooker/utils/encodings.py b/ricecooker/utils/encodings.py
index cefcae37..b323a329 100644
--- a/ricecooker/utils/encodings.py
+++ b/ricecooker/utils/encodings.py
@@ -1,24 +1,25 @@
-import re
import base64
+import re
-BASE64_REGEX_STR = r'data:image\/([A-Za-z]*);base64,((?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)*)'
+BASE64_REGEX_STR = r"data:image\/([A-Za-z]*);base64,((?:[A-Za-z0-9+\/]{4})*(?:[A-Za-z0-9+\/]{2}==|[A-Za-z0-9+\/]{3}=)*)"
BASE64_REGEX = re.compile(BASE64_REGEX_STR, flags=re.IGNORECASE)
def get_base64_encoding(text):
- """ get_base64_encoding: Get the first base64 match or None
- Args:
- text (str): text to check for base64 encoding
- Returns: First match in text
+ """get_base64_encoding: Get the first base64 match or None
+ Args:
+ text (str): text to check for base64 encoding
+ Returns: First match in text
"""
return BASE64_REGEX.search(text)
+
def write_base64_to_file(encoding, fpath_out):
- """ write_base64_to_file: Convert base64 image to file
- Args:
- encoding (str): base64 encoded string
- fpath_out (str): path to file to write
- Returns: None
+ """write_base64_to_file: Convert base64 image to file
+ Args:
+ encoding (str): base64 encoded string
+ fpath_out (str): path to file to write
+ Returns: None
"""
encoding_match = get_base64_encoding(encoding)
@@ -26,14 +27,15 @@ def write_base64_to_file(encoding, fpath_out):
assert encoding_match, "Error writing to file: Invalid base64 encoding"
with open(fpath_out, "wb") as target_file:
- target_file.write(base64.decodebytes(encoding_match.group(2).encode('utf-8')))
+ target_file.write(base64.decodebytes(encoding_match.group(2).encode("utf-8")))
+
def encode_file_to_base64(fpath_in, prefix):
- """ encode_file_to_base64: gets base64 encoding of file
- Args:
- fpath_in (str): path to file to encode
- prefix (str): file data for encoding (e.g. 'data:image/png;base64,')
- Returns: base64 encoding of file
+ """encode_file_to_base64: gets base64 encoding of file
+ Args:
+ fpath_in (str): path to file to encode
+ prefix (str): file data for encoding (e.g. 'data:image/png;base64,')
+ Returns: base64 encoding of file
"""
- with open(fpath_in, 'rb') as file_obj:
- return prefix + base64.b64encode(file_obj.read()).decode('utf-8')
+ with open(fpath_in, "rb") as file_obj:
+ return prefix + base64.b64encode(file_obj.read()).decode("utf-8")
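A round-trip usage sketch for the three helpers above, assuming a local `thumbnail.png` exists (the prefix must match the image's MIME type):

    # Hypothetical file names.
    data_uri = encode_file_to_base64("thumbnail.png", "data:image/png;base64,")
    assert get_base64_encoding(data_uri) is not None
    write_base64_to_file(data_uri, "thumbnail_copy.png")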
diff --git a/ricecooker/utils/html.py b/ricecooker/utils/html.py
index c2920855..f2325de2 100644
--- a/ricecooker/utils/html.py
+++ b/ricecooker/utils/html.py
@@ -1,51 +1,56 @@
import logging
import os
import re
-import requests
import signal
import time
import urllib
-
-import chardet
-
-from bs4 import BeautifulSoup
from selenium import webdriver
-from urllib.parse import urlparse, unquote
+from urllib.parse import unquote
+from urllib.parse import urlparse
from urllib.request import pathname2url
-from .caching import FileCache, CacheControlAdapter
-from ricecooker.config import LOGGER, PHANTOMJS_PATH, STRICT
+import chardet
+import requests
+from bs4 import BeautifulSoup
+from .caching import CacheControlAdapter
+from .caching import FileCache
+from ricecooker.config import LOGGER
+from ricecooker.config import PHANTOMJS_PATH
+from ricecooker.config import STRICT
# create a default session with basic caching mechanisms (similar to what a browser would do)
sess = requests.Session()
-cache = FileCache('.webcache', use_dir_lock=True)
+cache = FileCache(".webcache", use_dir_lock=True)
basic_adapter = CacheControlAdapter(cache=cache)
-sess.mount('http://', basic_adapter)
-sess.mount('https://', basic_adapter)
+sess.mount("http://", basic_adapter)
+sess.mount("https://", basic_adapter)
if PHANTOMJS_PATH is None:
- PHANTOMJS_PATH = os.path.join(os.getcwd(), "node_modules", "phantomjs-prebuilt", "bin", "phantomjs")
+ PHANTOMJS_PATH = os.path.join(
+ os.getcwd(), "node_modules", "phantomjs-prebuilt", "bin", "phantomjs"
+ )
class WebDriver(object):
-
def __init__(self, url, delay=1000):
self.url = url
self.delay = delay
def __enter__(self):
if not os.path.isfile(PHANTOMJS_PATH):
- raise Exception("You must install phantomjs-prebuilt in the directory"
- " you're running in with `npm install phantomjs-prebuilt`"
- " or set the environment variable `PHANTOMJS_PATH`")
+ raise Exception(
+ "You must install phantomjs-prebuilt in the directory"
+ " you're running in with `npm install phantomjs-prebuilt`"
+ " or set the environment variable `PHANTOMJS_PATH`"
+ )
self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
self.driver.get(self.url)
time.sleep(self.delay / 1000.0)
return self.driver
- def __exit__(self ,type, value, traceback):
+ def __exit__(self, type, value, traceback):
# driver.quit() by itself doesn't suffice to fully terminate spawned
# PhantomJS processes:
# see https://github.com/seleniumhq/selenium/issues/767
@@ -54,10 +59,16 @@ def __exit__(self ,type, value, traceback):
def get_generated_html_from_driver(driver, tagname="html"):
- driver.execute_script("return document.getElementsByTagName('{tagname}')[0].innerHTML".format(tagname=tagname))
+ driver.execute_script(
+ "return document.getElementsByTagName('{tagname}')[0].innerHTML".format(
+ tagname=tagname
+ )
+ )
-def replace_links(content, urls_to_replace, download_root=None, content_dir=None, relative_links=False):
+def replace_links(
+ content, urls_to_replace, download_root=None, content_dir=None, relative_links=False
+):
for key in urls_to_replace:
value = urls_to_replace[key]
if key == value:
@@ -76,16 +87,18 @@ def replace_links(content, urls_to_replace, download_root=None, content_dir=None
rel_path = pathname2url(rel_path)
if relative_links:
- value = pathname2url(os.path.relpath(os.path.join(download_root, value), content_dir))
+ value = pathname2url(
+ os.path.relpath(os.path.join(download_root, value), content_dir)
+ )
# When we get an absolute URL, it may appear in one of three different ways in the page:
key_variants = [
# 1. /path/to/file.html
- key.replace(url_parts.scheme + '://' + url_parts.netloc, ''),
+ key.replace(url_parts.scheme + "://" + url_parts.netloc, ""),
# 2. https://www.domain.com/path/to/file.html
key,
# 3. //www.domain.com/path/to/file.html
- key.replace(url_parts.scheme + ':', ''),
+ key.replace(url_parts.scheme + ":", ""),
]
if rel_path and content_dir:
@@ -107,7 +120,9 @@ def replace_links(content, urls_to_replace, download_root=None, content_dir=None
# we avoid using BeautifulSoup because Python HTML parsers can be destructive and
# do things like strip out the doctype.
content = content.replace('="{}"'.format(variant), '="{}"'.format(value))
- content = content.replace('url({})'.format(variant), 'url({})'.format(value))
+ content = content.replace(
+ "url({})".format(variant), "url({})".format(value)
+ )
for match in srcset_links:
url = match[1]
@@ -145,9 +160,13 @@ def calculate_relative_url(url, filename=None, baseurl=None, subpath=None):
# if a base path was supplied, calculate the file's subpath relative to it
if baseurl:
- baseurl = urllib.parse.urljoin(baseurl, ".") # ensure baseurl is normalized (to remove '/./' and '/../')
- assert url.startswith(baseurl), "URL {} must start with baseurl {}".format(url, baseurl)
- subpath = subpath + url[len(baseurl):].strip("/").split("/")[:-1]
+ baseurl = urllib.parse.urljoin(
+ baseurl, "."
+ ) # ensure baseurl is normalized (to remove '/./' and '/../')
+ assert url.startswith(baseurl), "URL {} must start with baseurl {}".format(
+ url, baseurl
+ )
+ subpath = subpath + url[len(baseurl) :].strip("/").split("/")[:-1]
# if we don't have a filename, extract it from the URL
if not filename:
@@ -159,7 +178,16 @@ def calculate_relative_url(url, filename=None, baseurl=None, subpath=None):
return relative_file_url, subpath, filename
-def download_file(url, destpath, filename=None, baseurl=None, subpath=None, middleware_callbacks=None, middleware_kwargs=None, request_fn=sess.get):
+def download_file(
+ url,
+ destpath,
+ filename=None,
+ baseurl=None,
+ subpath=None,
+ middleware_callbacks=None,
+ middleware_kwargs=None,
+ request_fn=sess.get,
+):
"""
Download a file from a URL, into a destination folder, with optional use of relative paths and middleware processors.
@@ -170,7 +198,9 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd
- If `middleware_kwargs` are also specified, they will also be passed in to each function in middleware_callbacks.
"""
- relative_file_url, subpath, filename = calculate_relative_url(url, filename=filename, baseurl=baseurl, subpath=subpath)
+ relative_file_url, subpath, filename = calculate_relative_url(
+ url, filename=filename, baseurl=baseurl, subpath=subpath
+ )
LOGGER.info("Download called for {}".format(url))
# ensure that the destination directory exists
@@ -188,18 +218,20 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd
# if there are any middleware callbacks, apply them to the content
if middleware_callbacks:
- if 'content-type' in response.headers:
- type = response.headers['content-type'].split(';')[0]
+ if "content-type" in response.headers:
+ type = response.headers["content-type"].split(";")[0]
# Rely on requests to convert bytes to unicode for us when it's a text file
# otherwise, we just use bytes
- if type.startswith('text'):
+ if type.startswith("text"):
# It seems requests defaults to ISO-8859-1 when the headers don't explicitly declare an
# encoding. In this case, we're better off using chardet to guess instead.
if not response.encoding:
encoding = chardet.detect(response.content)
- if encoding and 'encoding' in encoding:
- response.encoding = encoding['encoding']
- LOGGER.warning("encoding for {} = {}".format(url, response.encoding))
+ if encoding and "encoding" in encoding:
+ response.encoding = encoding["encoding"]
+ LOGGER.warning(
+ "encoding for {} = {}".format(url, response.encoding)
+ )
content = response.text
if not isinstance(middleware_callbacks, list):
@@ -219,7 +251,7 @@ def download_file(url, destpath, filename=None, baseurl=None, subpath=None, midd
# ensure content is encoded, as we're doing a binary write
if isinstance(content, str):
- content = content.encode('utf-8')
+ content = content.encode("utf-8")
# calculate the final destination for the file, and write the content out to there
dest = os.path.join(fulldestpath, filename)
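To make the `key_variants` logic in `replace_links` above concrete: a single absolute URL can appear in a page in three forms, exactly as the inline comments describe (illustrative URL only):

    from urllib.parse import urlparse

    key = "https://www.domain.com/path/to/file.html"
    url_parts = urlparse(key)
    key_variants = [
        key.replace(url_parts.scheme + "://" + url_parts.netloc, ""),  # "/path/to/file.html"
        key,                                                           # "https://www.domain.com/path/to/file.html"
        key.replace(url_parts.scheme + ":", ""),                       # "//www.domain.com/path/to/file.html"
    ]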
diff --git a/ricecooker/utils/html_writer.py b/ricecooker/utils/html_writer.py
index 8b12d8e1..7792d823 100644
--- a/ricecooker/utils/html_writer.py
+++ b/ricecooker/utils/html_writer.py
@@ -1,20 +1,22 @@
import os
import zipfile
+
from ricecooker.utils.downloader import read
-class HTMLWriter():
+
+class HTMLWriter:
"""
- Class for writing zipfiles
+ Class for writing zipfiles
"""
- zf = None # Zip file to write to
- write_to_path = None # Where to write zip file
+ zf = None # Zip file to write to
+ write_to_path = None # Where to write zip file
def __init__(self, write_to_path, mode="w"):
""" Args: write_to_path: (str) where to write zip file """
- self.map = {} # Keeps track of content to write to csv
+ self.map = {} # Keeps track of content to write to csv
self.write_to_path = write_to_path # Where to write zip file
- self.mode = mode # What mode to open zipfile in
+ self.mode = mode # What mode to open zipfile in
def __enter__(self):
""" Called when opening context (e.g. with HTMLWriter() as writer: ) """
@@ -41,49 +43,55 @@ def _copy_to_zipfile(self, filepath, arcname=None):
""" USER-FACING METHODS """
def open(self):
- """ open: Opens zipfile to write to
- Args: None
- Returns: None
+ """open: Opens zipfile to write to
+ Args: None
+ Returns: None
"""
self.zf = zipfile.ZipFile(self.write_to_path, self.mode)
def close(self):
- """ close: Close zipfile when done
- Args: None
- Returns: None
+ """close: Close zipfile when done
+ Args: None
+ Returns: None
"""
- index_present = self.contains('index.html')
- self.zf.close() # Make sure zipfile closes no matter what
+ index_present = self.contains("index.html")
+ self.zf.close() # Make sure zipfile closes no matter what
if not index_present:
- raise ReferenceError("Invalid Zip at {}: missing index.html file (use write_index_contents method)".format(self.write_to_path))
+ raise ReferenceError(
+ "Invalid Zip at {}: missing index.html file (use write_index_contents method)".format(
+ self.write_to_path
+ )
+ )
def contains(self, filename):
- """ contains: Checks if filename is in the zipfile
- Args: filename: (str) name of file to check
- Returns: boolean indicating whether or not filename is in the zip
+ """contains: Checks if filename is in the zipfile
+ Args: filename: (str) name of file to check
+ Returns: boolean indicating whether or not filename is in the zip
"""
return filename in self.zf.namelist()
def write_contents(self, filename, contents, directory=None):
- """ write_contents: Write contents to filename in zip
- Args:
- contents: (str) contents of file
- filename: (str) name of file in zip
- directory: (str) directory in zipfile to write file to (optional)
- Returns: path to file in zip
+ """write_contents: Write contents to filename in zip
+ Args:
+ contents: (str) contents of file
+ filename: (str) name of file in zip
+ directory: (str) directory in zipfile to write file to (optional)
+ Returns: path to file in zip
"""
- filepath = "{}/{}".format(directory.rstrip("/"), filename) if directory else filename
+ filepath = (
+ "{}/{}".format(directory.rstrip("/"), filename) if directory else filename
+ )
self._write_to_zipfile(filepath, contents)
return filepath
def write_file(self, filepath, filename=None, directory=None):
- """ write_file: Write local file to zip
- Args:
- filepath: (str) location to local file
- directory: (str) directory in zipfile to write file to (optional)
- Returns: path to file in zip
+ """write_file: Write local file to zip
+ Args:
+ filepath: (str) location to local file
+ directory: (str) directory in zipfile to write file to (optional)
+ Returns: path to file in zip
- Note: filepath must be a relative path
+ Note: filepath must be a relative path
"""
arcname = None
if filename or directory:
@@ -94,22 +102,24 @@ def write_file(self, filepath, filename=None, directory=None):
return arcname or filepath
def write_url(self, url, filename, directory=None):
- """ write_url: Write contents from url to filename in zip
- Args:
- url: (str) url to file to download
- filename: (str) name of file in zip
- directory: (str) directory in zipfile to write file to (optional)
- Returns: path to file in zip
+ """write_url: Write contents from url to filename in zip
+ Args:
+ url: (str) url to file to download
+ filename: (str) name of file in zip
+ directory: (str) directory in zipfile to write file to (optional)
+ Returns: path to file in zip
"""
- filepath = "{}/{}".format(directory.rstrip("/"), filename) if directory else filename
+ filepath = (
+ "{}/{}".format(directory.rstrip("/"), filename) if directory else filename
+ )
if not self.contains(filepath):
self._write_to_zipfile(filepath, read(url))
return filepath
def write_index_contents(self, contents):
- """ write_index_contents: Write main index file to zip
- Args:
- contents: (str) contents of file
- Returns: path to file in zip
+ """write_index_contents: Write main index file to zip
+ Args:
+ contents: (str) contents of file
+ Returns: path to file in zip
"""
- self._write_to_zipfile('index.html', contents)
+ self._write_to_zipfile("index.html", contents)
diff --git a/ricecooker/utils/images.py b/ricecooker/utils/images.py
index 3fccd630..3ac77425 100644
--- a/ricecooker/utils/images.py
+++ b/ricecooker/utils/images.py
@@ -1,22 +1,20 @@
import os
import zipfile
-import ebooklib
-import ebooklib.epub
from io import BytesIO
-
+import ebooklib.epub
from pdf2image import convert_from_path
from PIL import Image
from .thumbscropping import scale_and_crop
-
# SMARTCROP UTILS
################################################################################
THUMBNAIL_SIZE = (400, 225) # 16:9 aspect ratio
+
def scale_and_crop_thumbnail(image, size=THUMBNAIL_SIZE, crop="smart", **kwargs):
"""
Scale and crop the PIL Image ``image`` to maximum dimensions of ``size``.
@@ -32,10 +30,10 @@ def scale_and_crop_thumbnail(image, size=THUMBNAIL_SIZE, crop="smart", **kwargs)
return scale_and_crop(image, size, crop=crop, upscale=True, **kwargs)
-
# THUMBNAILS FOR CONTENT KINDS
################################################################################
+
def create_image_from_epub(epubfile, fpath_out, crop=None):
"""
Generate a thumbnail image from `epubfile` and save it to `fpath_out`.
@@ -45,10 +43,10 @@ def create_image_from_epub(epubfile, fpath_out, crop=None):
book = ebooklib.epub.read_epub(epubfile)
# 1. try to get cover image from book metadata (content.opf)
cover_item = None
- covers = book.get_metadata('http://www.idpf.org/2007/opf', 'cover')
+ covers = book.get_metadata("http://www.idpf.org/2007/opf", "cover")
if covers:
- cover_tuple = covers[0] # ~= (None, {'name':'cover', 'content':'item1'})
- cover_item_id = cover_tuple[1]['content']
+ cover_tuple = covers[0] # ~= (None, {'name':'cover', 'content':'item1'})
+ cover_item_id = cover_tuple[1]["content"]
for item in book.items:
if item.id == cover_item_id:
cover_item = item
@@ -58,7 +56,9 @@ def create_image_from_epub(epubfile, fpath_out, crop=None):
# 2. fallback to get first image in the ePub file
images = list(book.get_items_of_type(ebooklib.ITEM_IMAGE))
if not images:
- raise ThumbnailGenerationError("ePub file {} contains no images.".format(epubfile))
+ raise ThumbnailGenerationError(
+ "ePub file {} contains no images.".format(epubfile)
+ )
# TODO: get largest image of the bunch
image_data = BytesIO(images[0].get_content())
@@ -78,9 +78,9 @@ def create_image_from_zip(htmlfile, fpath_out, crop="smart"):
biggest_name = None
size = 0
try:
- with zipfile.ZipFile(htmlfile, 'r') as zf:
+ with zipfile.ZipFile(htmlfile, "r") as zf:
# get the biggest (most pixels) image in the zip
- image_exts = ['png', 'PNG', 'jpeg', 'JPEG', 'jpg', 'JPG']
+ image_exts = ["png", "PNG", "jpeg", "JPEG", "jpg", "JPG"]
for filename in zf.namelist():
_, dotext = os.path.splitext(filename)
ext = dotext[1:]
@@ -94,7 +94,9 @@ def create_image_from_zip(htmlfile, fpath_out, crop="smart"):
biggest_name = filename
size = img_size
if biggest_name is None:
- raise ThumbnailGenerationError("HTML5 zip file {} contains no images.".format(htmlfile))
+ raise ThumbnailGenerationError(
+ "HTML5 zip file {} contains no images.".format(htmlfile)
+ )
with zf.open(biggest_name) as fhandle:
image_data = fhandle.read()
with BytesIO(image_data) as bhandle:
@@ -110,12 +112,14 @@ def create_image_from_pdf_page(fpath_in, fpath_out, page_number=0, crop=None):
Create an image from the pdf at fpath_in and write result to fpath_out.
"""
try:
- assert fpath_in.endswith('pdf'), "File must be in pdf format"
- pages = convert_from_path(fpath_in, 500, first_page=page_number, last_page=page_number+1)
+ assert fpath_in.endswith("pdf"), "File must be in pdf format"
+ pages = convert_from_path(
+ fpath_in, 500, first_page=page_number, last_page=page_number + 1
+ )
page = pages[0]
# resize
page = scale_and_crop_thumbnail(page, zoom=10, crop=crop)
- page.save(fpath_out, 'PNG')
+ page.save(fpath_out, "PNG")
except Exception as e:
raise ThumbnailGenerationError("Fail on PDF {} {}".format(fpath_in, e))
@@ -123,33 +127,38 @@ def create_image_from_pdf_page(fpath_in, fpath_out, page_number=0, crop=None):
# TILED THUMBNAILS FOR TOPIC NODES (FOLDERS)
################################################################################
+
def create_tiled_image(source_images, fpath_out):
"""
Create a 16:9 tiled image from list of image paths provided in source_images
and write result to fpath_out.
"""
try:
- sizes = {1:1, 4:2, 9:3, 16:4, 25:5, 36:6, 49:7}
- assert len(source_images) in sizes.keys(), "Number of images must be a perfect square <= 49"
+ sizes = {1: 1, 4: 2, 9: 3, 16: 4, 25: 5, 36: 6, 49: 7}
+ assert (
+ len(source_images) in sizes.keys()
+ ), "Number of images must be a perfect square <= 49"
root = sizes[len(source_images)]
images = list(map(Image.open, source_images))
- new_im = Image.new('RGBA', THUMBNAIL_SIZE)
- offset = (int(float(THUMBNAIL_SIZE[0]) / float(root)),
- int(float(THUMBNAIL_SIZE[1]) / float(root)) )
+ new_im = Image.new("RGBA", THUMBNAIL_SIZE)
+ offset = (
+ int(float(THUMBNAIL_SIZE[0]) / float(root)),
+ int(float(THUMBNAIL_SIZE[1]) / float(root)),
+ )
index = 0
for y_index in range(root):
for x_index in range(root):
im = scale_and_crop_thumbnail(images[index], size=offset)
- new_im.paste(im, (int(offset[0]*x_index), int(offset[1]*y_index)))
+ new_im.paste(im, (int(offset[0] * x_index), int(offset[1] * y_index)))
index = index + 1
new_im.save(fpath_out)
except Exception as e:
raise ThumbnailGenerationError("Failed due to {}".format(e))
-def convert_image(filename, dest_dir=None, size=None, format='PNG'):
+def convert_image(filename, dest_dir=None, size=None, format="PNG"):
"""
Converts an image to a specified output format. The converted image will have the same
file basename as filename, but with the extension of the converted format.
@@ -162,7 +171,9 @@ def convert_image(filename, dest_dir=None, size=None, format='PNG'):
:returns: Path to converted file.
"""
- assert os.path.exists(filename), "Image file not found: {}".format(os.path.abspath(filename))
+ assert os.path.exists(filename), "Image file not found: {}".format(
+ os.path.abspath(filename)
+ )
if not dest_dir:
dest_dir = os.path.dirname(os.path.abspath(filename))
@@ -187,8 +198,10 @@ def convert_image(filename, dest_dir=None, size=None, format='PNG'):
# EXCEPTIONS
################################################################################
+
class ThumbnailGenerationError(Exception):
"""
Custom error returned when thumbnail extraction process fails.
"""
+
pass
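
# --- Illustrative usage sketch (not part of the diff) -------------------------
# Thumbnail helpers reformatted above, called with their signatures as shown in
# the hunks. The input/output file paths are placeholders.
from ricecooker.utils.images import (
    ThumbnailGenerationError,
    create_image_from_pdf_page,
    create_tiled_image,
)

try:
    # 400x225 (16:9) PNG thumbnail from the first page of a PDF
    create_image_from_pdf_page("lesson.pdf", "lesson_thumb.png")
    # tiled folder thumbnail; the number of tiles must be a perfect square <= 49
    create_tiled_image(
        ["thumb1.png", "thumb2.png", "thumb3.png", "thumb4.png"],
        "topic_thumb.png",
    )
except ThumbnailGenerationError as e:
    print("Thumbnail generation failed:", e)
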
diff --git a/ricecooker/utils/jsontrees.py b/ricecooker/utils/jsontrees.py
index a864609e..f8abc66a 100644
--- a/ricecooker/utils/jsontrees.py
+++ b/ricecooker/utils/jsontrees.py
@@ -1,17 +1,20 @@
import json
import os
-from ricecooker.classes import files, nodes, questions
+from le_utils.constants import content_kinds
+from le_utils.constants import roles
+
+from ricecooker.classes import files
+from ricecooker.classes import nodes
+from ricecooker.classes import questions
from ricecooker.classes.licenses import get_license
-from ricecooker.config import LOGGER
-from ricecooker.exceptions import UnknownFileTypeError, UnknownQuestionTypeError
from ricecooker.classes.nodes import ChannelNode
+from ricecooker.config import LOGGER
+from ricecooker.exceptions import UnknownFileTypeError
+from ricecooker.exceptions import UnknownQuestionTypeError
# CONSTANTS USED TO SELECT APPROPRIATE CLASS DURING DESERIALIZATION FROM JSON
################################################################################
-from le_utils.constants import roles
-
-from le_utils.constants import content_kinds
TOPIC_NODE = content_kinds.TOPIC
VIDEO_NODE = content_kinds.VIDEO
@@ -46,6 +49,7 @@
# JSON READ/WRITE HELPERS
################################################################################
+
def read_tree_from_json(srcpath):
"""
Load ricecooker json tree data from json file at `srcpath`.
@@ -53,7 +57,7 @@ def read_tree_from_json(srcpath):
with open(srcpath) as infile:
json_tree = json.load(infile)
if json_tree is None:
- raise ValueError('Could not find ricecooker json tree')
+ raise ValueError("Could not find ricecooker json tree")
return json_tree
@@ -64,25 +68,26 @@ def write_tree_to_json_tree(destpath, json_tree):
parent_dir, _ = os.path.split(destpath)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir, exist_ok=True)
- with open(destpath, 'w', encoding='utf8') as json_file:
+ with open(destpath, "w", encoding="utf8") as json_file:
json.dump(json_tree, json_file, indent=2, ensure_ascii=False)
# CONSTRUCT CHANNEL FROM RICECOOKER JSON TREE
################################################################################
+
def get_channel_node_from_json(json_tree):
"""
Build `ChannelNode` from json data provided in `json_tree`.
"""
channel = ChannelNode(
- title=json_tree['title'],
- description=json_tree['description'],
- source_domain=json_tree['source_domain'],
- source_id=json_tree['source_id'],
- language=json_tree['language'],
- tagline=json_tree.get('tagline', None),
- thumbnail=json_tree.get('thumbnail', None),
+ title=json_tree["title"],
+ description=json_tree["description"],
+ source_domain=json_tree["source_domain"],
+ source_id=json_tree["source_id"],
+ language=json_tree["language"],
+ tagline=json_tree.get("tagline", None),
+ thumbnail=json_tree.get("thumbnail", None),
)
return channel
@@ -92,262 +97,264 @@ def build_tree_from_json(parent_node, sourcetree):
    Recursively parse nodes in the list `sourcetree` and add them as children
to the `parent_node`. Usually called with `parent_node` being a `ChannelNode`.
"""
- EXPECTED_NODE_TYPES = [TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE,
- DOCUMENT_NODE, HTML5_NODE, SLIDESHOW_NODE]
+ EXPECTED_NODE_TYPES = [
+ TOPIC_NODE,
+ VIDEO_NODE,
+ AUDIO_NODE,
+ EXERCISE_NODE,
+ DOCUMENT_NODE,
+ HTML5_NODE,
+ SLIDESHOW_NODE,
+ ]
for source_node in sourcetree:
- kind = source_node['kind']
+ kind = source_node["kind"]
if kind not in EXPECTED_NODE_TYPES:
- LOGGER.critical('Unexpected node kind found: ' + kind)
- raise NotImplementedError('Unexpected node kind found in json data.')
+ LOGGER.critical("Unexpected node kind found: " + kind)
+ raise NotImplementedError("Unexpected node kind found in json data.")
if kind == TOPIC_NODE:
child_node = nodes.TopicNode(
- source_id=source_node.get('source_id', None),
- title=source_node['title'],
- description=source_node.get('description'),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
+ source_id=source_node.get("source_id", None),
+ title=source_node["title"],
+ description=source_node.get("description"),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
            # no role for topics (computed dynamically from descendants)
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False),
- tags=source_node.get('tags'),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get("derive_thumbnail", False),
+ tags=source_node.get("tags"),
)
parent_node.add_child(child_node)
- source_tree_children = source_node.get('children', [])
+ source_tree_children = source_node.get("children", [])
build_tree_from_json(child_node, source_tree_children)
elif kind == VIDEO_NODE:
child_node = nodes.VideoNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False),
- tags=source_node.get('tags'),
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get("derive_thumbnail", False),
+ tags=source_node.get("tags"),
)
- add_files(child_node, source_node.get('files') or [])
+ add_files(child_node, source_node.get("files") or [])
parent_node.add_child(child_node)
elif kind == AUDIO_NODE:
child_node = nodes.AudioNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False),
- tags=source_node.get('tags'),
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get("derive_thumbnail", False),
+ tags=source_node.get("tags"),
)
- add_files(child_node, source_node.get('files') or [])
+ add_files(child_node, source_node.get("files") or [])
parent_node.add_child(child_node)
elif kind == EXERCISE_NODE:
child_node = nodes.ExerciseNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False), # not supported yet
- tags=source_node.get('tags'),
- exercise_data=source_node.get('exercise_data'),
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get(
+ "derive_thumbnail", False
+ ), # not supported yet
+ tags=source_node.get("tags"),
+ exercise_data=source_node.get("exercise_data"),
questions=[],
)
- add_questions(child_node, source_node.get('questions') or [])
+ add_questions(child_node, source_node.get("questions") or [])
parent_node.add_child(child_node)
elif kind == DOCUMENT_NODE:
child_node = nodes.DocumentNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- tags=source_node.get('tags'),
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ tags=source_node.get("tags"),
)
- add_files(child_node, source_node.get('files') or [])
+ add_files(child_node, source_node.get("files") or [])
parent_node.add_child(child_node)
elif kind == HTML5_NODE:
child_node = nodes.HTML5AppNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False),
- tags=source_node.get('tags'),
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get("derive_thumbnail", False),
+ tags=source_node.get("tags"),
)
- add_files(child_node, source_node.get('files') or [])
+ add_files(child_node, source_node.get("files") or [])
parent_node.add_child(child_node)
elif kind == SLIDESHOW_NODE:
child_node = nodes.SlideshowNode(
- source_id=source_node['source_id'],
- title=source_node['title'],
- description=source_node.get('description'),
- license=get_license(**source_node['license']),
- author=source_node.get('author'),
- aggregator=source_node.get('aggregator'),
- provider=source_node.get('provider'),
- role=source_node.get('role', roles.LEARNER),
- language=source_node.get('language'),
- thumbnail=source_node.get('thumbnail'),
- derive_thumbnail=source_node.get('derive_thumbnail', False),
- tags=source_node.get('tags'),
-
+ source_id=source_node["source_id"],
+ title=source_node["title"],
+ description=source_node.get("description"),
+ license=get_license(**source_node["license"]),
+ author=source_node.get("author"),
+ aggregator=source_node.get("aggregator"),
+ provider=source_node.get("provider"),
+ role=source_node.get("role", roles.LEARNER),
+ language=source_node.get("language"),
+ thumbnail=source_node.get("thumbnail"),
+ derive_thumbnail=source_node.get("derive_thumbnail", False),
+ tags=source_node.get("tags"),
)
- add_files(child_node, source_node.get('files') or [])
+ add_files(child_node, source_node.get("files") or [])
parent_node.add_child(child_node)
# TODO: add support for H5P content kind
else:
- LOGGER.critical('Encountered an unknown kind: ' + str(source_node))
+ LOGGER.critical("Encountered an unknown kind: " + str(source_node))
continue
return parent_node
def add_files(node, file_list):
- EXPECTED_FILE_TYPES = [VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, EPUB_FILE,
- HTML5_FILE, THUMBNAIL_FILE, SUBTITLES_FILE, SLIDESHOW_IMAGE_FILE]
+ EXPECTED_FILE_TYPES = [
+ VIDEO_FILE,
+ AUDIO_FILE,
+ DOCUMENT_FILE,
+ EPUB_FILE,
+ HTML5_FILE,
+ THUMBNAIL_FILE,
+ SUBTITLES_FILE,
+ SLIDESHOW_IMAGE_FILE,
+ ]
for f in file_list:
- file_type = f.get('file_type')
+ file_type = f.get("file_type")
if file_type not in EXPECTED_FILE_TYPES:
LOGGER.critical(file_type)
- raise NotImplementedError('Unexpected File type found in channel json.')
+ raise NotImplementedError("Unexpected File type found in channel json.")
- path = f.get('path') # path can be an URL or a local path (or None)
- preset = f.get('preset', None)
+ path = f.get("path") # path can be an URL or a local path (or None)
+ preset = f.get("preset", None)
# handle different types of files
if file_type == VIDEO_FILE:
# handle three types of video files
- if 'youtube_id' in f:
+ if "youtube_id" in f:
video_file = files.YouTubeVideoFile(
- youtube_id=f['youtube_id'],
- download_settings=f.get('download_settings', None),
- high_resolution=f.get('high_resolution', False),
- maxheight=f.get('maxheight', None),
- language=f.get('language', None),
- preset=preset
+ youtube_id=f["youtube_id"],
+ download_settings=f.get("download_settings", None),
+ high_resolution=f.get("high_resolution", False),
+ maxheight=f.get("maxheight", None),
+ language=f.get("language", None),
+ preset=preset,
)
- elif 'web_url' in f:
+ elif "web_url" in f:
video_file = files.WebVideoFile(
- web_url=f['web_url'],
- download_settings=f.get('download_settings', None),
- high_resolution=f.get('high_resolution', False),
- maxheight=f.get('maxheight', None),
- language=f.get('language', None),
- preset=preset
+ web_url=f["web_url"],
+ download_settings=f.get("download_settings", None),
+ high_resolution=f.get("high_resolution", False),
+ maxheight=f.get("maxheight", None),
+ language=f.get("language", None),
+ preset=preset,
)
else:
video_file = files.VideoFile(
- path=f['path'],
- language=f.get('language', None),
- ffmpeg_settings=f.get('ffmpeg_settings'),
-
+ path=f["path"],
+ language=f.get("language", None),
+ ffmpeg_settings=f.get("ffmpeg_settings"),
)
node.add_file(video_file)
elif file_type == AUDIO_FILE:
node.add_file(
files.AudioFile(
- path=f['path'],
- language=f.get('language', None),
- preset=preset
-
+ path=f["path"], language=f.get("language", None), preset=preset
)
)
elif file_type == DOCUMENT_FILE:
node.add_file(
files.DocumentFile(
- path=path,
- language=f.get('language', None),
- preset=preset
-
+ path=path, language=f.get("language", None), preset=preset
)
)
elif file_type == EPUB_FILE:
node.add_file(
files.EPubFile(
- path=path,
- language=f.get('language', None),
- preset=preset
-
+ path=path, language=f.get("language", None), preset=preset
)
)
elif file_type == HTML5_FILE:
node.add_file(
files.HTMLZipFile(
- path=path,
- language=f.get('language', None),
- preset=preset
-
+ path=path, language=f.get("language", None), preset=preset
)
)
elif file_type == THUMBNAIL_FILE:
- if 'encoding' in f:
+ if "encoding" in f:
node.add_file(
files.Base64ImageFile(
- encoding=f['encoding'],
+ encoding=f["encoding"],
)
)
else:
node.add_file(
files.ThumbnailFile(
path=path,
- language=f.get('language', None),
+ language=f.get("language", None),
)
)
elif file_type == SUBTITLES_FILE:
- if 'youtube_id' in f:
+ if "youtube_id" in f:
node.add_file(
files.YouTubeSubtitleFile(
- youtube_id=f['youtube_id'],
- language=f['language']
+ youtube_id=f["youtube_id"], language=f["language"]
)
)
else:
- keys = ['language', 'subtitlesformat']
- params = {'path': path}
+ keys = ["language", "subtitlesformat"]
+ params = {"path": path}
for key in keys:
if key in f:
params[key] = f[key]
@@ -357,68 +364,75 @@ def add_files(node, file_list):
node.add_file(
files.SlideImageFile(
path=path,
- language=f.get('language', None),
- caption=f.get('caption', ''),
- descriptive_text=f.get('descriptive_text', '')
+ language=f.get("language", None),
+ caption=f.get("caption", ""),
+ descriptive_text=f.get("descriptive_text", ""),
)
)
else:
- raise UnknownFileTypeError('Unrecognized file type "{0}"'.format(f['path']))
+ raise UnknownFileTypeError('Unrecognized file type "{0}"'.format(f["path"]))
def add_questions(exercise_node, question_list):
- EXPECTED_QUESTION_TYPES = [INPUT_QUESTION, MULTIPLE_SELECTION, SINGLE_SELECTION,
- FREE_RESPONSE, PERSEUS_QUESTION]
+ EXPECTED_QUESTION_TYPES = [
+ INPUT_QUESTION,
+ MULTIPLE_SELECTION,
+ SINGLE_SELECTION,
+ FREE_RESPONSE,
+ PERSEUS_QUESTION,
+ ]
for q in question_list:
- question_type = q.get('question_type')
+ question_type = q.get("question_type")
if question_type not in EXPECTED_QUESTION_TYPES:
LOGGER.critical(question_type)
- raise NotImplementedError('Unexpected question type found in channel json.')
+ raise NotImplementedError("Unexpected question type found in channel json.")
- question_text = q.get('question')
- hints = q.get('hints')
+ question_text = q.get("question")
+ hints = q.get("hints")
hints = hints if isinstance(hints, str) else [hint for hint in hints or []]
if question_type == exercises.MULTIPLE_SELECTION:
q_obj = questions.MultipleSelectQuestion(
- id=q['id'],
+ id=q["id"],
question=question_text,
- correct_answers=[answer for answer in q['correct_answers']],
- all_answers=[answer for answer in q['all_answers']],
+ correct_answers=[answer for answer in q["correct_answers"]],
+ all_answers=[answer for answer in q["all_answers"]],
hints=hints,
)
exercise_node.add_question(q_obj)
elif question_type == exercises.SINGLE_SELECTION:
q_obj = questions.SingleSelectQuestion(
- id=q['id'],
+ id=q["id"],
question=question_text,
- correct_answer=q['correct_answer'],
- all_answers=[answer for answer in q['all_answers']],
+ correct_answer=q["correct_answer"],
+ all_answers=[answer for answer in q["all_answers"]],
hints=hints,
)
exercise_node.add_question(q_obj)
elif question_type == exercises.INPUT_QUESTION:
q_obj = questions.InputQuestion(
- id=q['id'],
+ id=q["id"],
question=question_text,
- answers=[answer for answer in q['answers']],
+ answers=[answer for answer in q["answers"]],
hints=hints,
)
exercise_node.add_question(q_obj)
elif question_type == exercises.PERSEUS_QUESTION:
q_obj = questions.PerseusQuestion(
- id=q['id'],
- raw_data=q.get('item_data'),
- source_url=q.get('source_url') or 'https://www.khanacademy.org/',
+ id=q["id"],
+ raw_data=q.get("item_data"),
+ source_url=q.get("source_url") or "https://www.khanacademy.org/",
)
exercise_node.add_question(q_obj)
else:
raise UnknownQuestionTypeError(
- 'Unrecognized question type {0}: accepted types are {1}'.format(question_type, [key for key, value in
- exercises.question_choices]))
+ "Unrecognized question type {0}: accepted types are {1}".format(
+ question_type, [key for key, value in exercises.question_choices]
+ )
+ )
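
# --- Illustrative usage sketch (not part of the diff) -------------------------
# A minimal ricecooker json tree wired through the helpers above. The channel
# metadata, license values, and file paths are placeholders; the dict keys are
# the ones read by get_channel_node_from_json/build_tree_from_json in the hunks,
# and the license dict is assumed to match get_license's keyword arguments.
from ricecooker.utils.jsontrees import (
    DOCUMENT_FILE,
    DOCUMENT_NODE,
    TOPIC_NODE,
    build_tree_from_json,
    get_channel_node_from_json,
    write_tree_to_json_tree,
)

json_tree = {
    "title": "Demo channel",  # placeholder channel metadata
    "description": "Small demo tree",
    "source_domain": "example.org",
    "source_id": "demo-channel",
    "language": "en",
    "children": [
        {
            "kind": TOPIC_NODE,
            "source_id": "topic-a",
            "title": "Topic A",
            "children": [
                {
                    "kind": DOCUMENT_NODE,
                    "source_id": "doc-1",
                    "title": "Sample PDF",
                    "license": {"license_id": "CC BY", "copyright_holder": "Demo"},
                    "files": [{"file_type": DOCUMENT_FILE, "path": "sample.pdf"}],
                }
            ],
        }
    ],
}

write_tree_to_json_tree("chefdata/trees/ricecooker_json_tree.json", json_tree)
channel = get_channel_node_from_json(json_tree)
build_tree_from_json(channel, json_tree["children"])
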
diff --git a/ricecooker/utils/kolibripreview.py b/ricecooker/utils/kolibripreview.py
index c17b55fe..5e312432 100755
--- a/ricecooker/utils/kolibripreview.py
+++ b/ricecooker/utils/kolibripreview.py
@@ -9,9 +9,9 @@ def validate(srcdir):
"""
Check if `srcdir` has an index.html in it.
"""
- indexpath = os.path.join(srcdir, 'index.html')
+ indexpath = os.path.join(srcdir, "index.html")
if not os.path.exists(indexpath):
- print('Missing index.html file in', srcdir)
+ print("Missing index.html file in", srcdir)
return False
return True
@@ -21,20 +21,25 @@ def main(args):
    Command line utility for previewing HTML5App content in Kolibri.
"""
if not os.path.exists(args.srcdir) or not os.path.isdir(args.srcdir):
- print('Error:', args.srcdir, 'is not a directory.')
+ print("Error:", args.srcdir, "is not a directory.")
sys.exit(1)
if not validate(args.srcdir):
- print('Validation failed; exiting.')
+ print("Validation failed; exiting.")
sys.exit(2)
# Write the contents of `srcdir` to `destzip`
destzipbase, _ = os.path.splitext(args.destzip)
- shutil.make_archive(destzipbase, 'zip', args.srcdir)
+ shutil.make_archive(destzipbase, "zip", args.srcdir)
-if __name__ == '__main__':
+if __name__ == "__main__":
parser = argparse.ArgumentParser(description=main.__doc__)
- parser.add_argument('--srcdir', help='HTML5 webroot (source directory)', default='.')
- parser.add_argument('--destzip', help='Path to a HTML5 zip file in local Kolibri installation', required=True)
+ parser.add_argument(
+ "--srcdir", help="HTML5 webroot (source directory)", default="."
+ )
+ parser.add_argument(
+ "--destzip",
+ help="Path to a HTML5 zip file in local Kolibri installation",
+ required=True,
+ )
args = parser.parse_args()
main(args)
-
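
# --- Illustrative usage sketch (not part of the diff) -------------------------
# Programmatic equivalent of the kolibripreview CLI above; srcdir/destzip are
# placeholders. From a shell the same flow is roughly:
#   python -m ricecooker.utils.kolibripreview --srcdir ./webroot --destzip <zip-in-kolibri-storage>
import argparse

from ricecooker.utils.kolibripreview import main, validate

args = argparse.Namespace(
    srcdir="./webroot", destzip="/path/to/existing_kolibri_file.zip"
)
assert validate(args.srcdir)  # checks that ./webroot/index.html exists
main(args)  # re-zips srcdir and overwrites destzip
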
diff --git a/ricecooker/utils/libstudio.py b/ricecooker/utils/libstudio.py
index 587ba9ff..fd5c7787 100644
--- a/ricecooker/utils/libstudio.py
+++ b/ricecooker/utils/libstudio.py
@@ -1,10 +1,11 @@
import requests
+
from ricecooker.config import LOGGER
# DEFAULT_STUDIO_URL = 'https://develop.studio.learningequality.org'
# DEFAULT_STUDIO_URL = 'http://127.0.0.1:8080'
-DEFAULT_STUDIO_URL = 'https://studio.learningequality.org'
+DEFAULT_STUDIO_URL = "https://studio.learningequality.org"
# TODO https://studio.learningequality.org/api/get_node_path/ca8f380/18932/41b2549
@@ -18,8 +19,10 @@ class StudioApi(object):
corrections, and other automation.
"""
- def __init__(self, token, username=None, password=None, studio_url=DEFAULT_STUDIO_URL):
- self.studio_url = studio_url.rstrip('/')
+ def __init__(
+ self, token, username=None, password=None, studio_url=DEFAULT_STUDIO_URL
+ ):
+ self.studio_url = studio_url.rstrip("/")
self.token = token
self.licenses_by_id = self.get_licenses()
if username and password:
@@ -28,10 +31,10 @@ def __init__(self, token, username=None, password=None, studio_url=DEFAULT_STUDI
self.session = None
def _create_logged_in_session(self, username, password):
- LOGIN_ENDPOINT = self.studio_url + '/accounts/login/'
+ LOGIN_ENDPOINT = self.studio_url + "/accounts/login/"
session = requests.session()
session.headers.update({"referer": self.studio_url})
- session.headers.update({'User-Agent': 'Mozilla/5.0 Firefox/63.0'})
+ session.headers.update({"User-Agent": "Mozilla/5.0 Firefox/63.0"})
session.get(LOGIN_ENDPOINT)
csrftoken = session.cookies.get("csrftoken")
session.headers.update({"csrftoken": csrftoken})
@@ -39,13 +42,12 @@ def _create_logged_in_session(self, username, password):
post_data = {
"csrfmiddlewaretoken": csrftoken,
"username": username,
- "password": password
+ "password": password,
}
response2 = session.post(LOGIN_ENDPOINT, data=post_data)
- assert response2.status_code == 200, 'Login POST failed'
+ assert response2.status_code == 200, "Login POST failed"
return session
-
def get_channel(self, channel_id):
"""
Calls the /api/channel/{{channel_id}} endpoint to get the channel info.
@@ -58,44 +60,42 @@ def get_channel(self, channel_id):
created this channel. If `Null` this means it's a manually uploaded
channel or a derivative channel
"""
- CHANNEL_ENDPOINT = self.studio_url + '/api/channel/'
+ CHANNEL_ENDPOINT = self.studio_url + "/api/channel/"
    # TODO: add TokenAuth to this endpoint so it can be used without session login
# headers = {"Authorization": "Token {0}".format(self.token)}
url = CHANNEL_ENDPOINT + channel_id
- LOGGER.info(' GET ' + url)
+ LOGGER.info(" GET " + url)
response = self.session.get(url)
channel_data = response.json()
return channel_data
- def get_channel_root_studio_id(self, channel_id, tree='main'):
+ def get_channel_root_studio_id(self, channel_id, tree="main"):
"""
Return the `studio_id` for the root of the tree `tree` for `channel_id`.
"""
channel_data = self.get_channel(channel_id)
- tree_key = tree + '_tree'
+ tree_key = tree + "_tree"
tree_data = channel_data[tree_key]
- return tree_data['id']
-
+ return tree_data["id"]
def get_licenses(self):
- LICENSES_LIST_ENDPOINT = self.studio_url + '/api/license'
+ LICENSES_LIST_ENDPOINT = self.studio_url + "/api/license"
headers = {"Authorization": "Token {0}".format(self.token)}
response = requests.get(LICENSES_LIST_ENDPOINT, headers=headers)
licenses_list = response.json()
licenses_dict = {}
for license in licenses_list:
- licenses_dict[license['id']] = license
+ licenses_dict[license["id"]] = license
return licenses_dict
-
def get_nodes_by_ids_complete(self, studio_id):
"""
Get the complete JSON representation of a content node from the Studio API.
"""
- NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
+ NODES_ENDPOINT = self.studio_url + "/api/get_nodes_by_ids_complete/"
headers = {"Authorization": "Token {0}".format(self.token)}
url = NODES_ENDPOINT + studio_id
- LOGGER.info(' GET ' + url)
+ LOGGER.info(" GET " + url)
response = requests.get(url, headers=headers)
studio_node = response.json()[0]
return studio_node
@@ -106,20 +106,23 @@ def get_nodes_by_ids_bulk(self, studio_ids):
    content node data in chunks of CHUNK_SIZE from the Studio API.
"""
CHUNK_SIZE = 25
- NODES_ENDPOINT = self.studio_url + '/api/get_nodes_by_ids_complete/'
+ NODES_ENDPOINT = self.studio_url + "/api/get_nodes_by_ids_complete/"
headers = {"Authorization": "Token {0}".format(self.token)}
studio_nodes = []
- studio_ids_chunks = [studio_ids[i:i+CHUNK_SIZE] for i in range(0, len(studio_ids), CHUNK_SIZE)]
+ studio_ids_chunks = [
+ studio_ids[i : i + CHUNK_SIZE]
+ for i in range(0, len(studio_ids), CHUNK_SIZE)
+ ]
for studio_ids_chunk in studio_ids_chunks:
- studio_ids_csv = ','.join(studio_ids_chunk)
+ studio_ids_csv = ",".join(studio_ids_chunk)
url = NODES_ENDPOINT + studio_ids_csv
- LOGGER.info(' GET ' + url)
+ LOGGER.info(" GET " + url)
response = requests.get(url, headers=headers)
chunk_nodes = response.json()
for chunk_node in chunk_nodes:
- if 'children' in chunk_node:
- child_nodes = self.get_nodes_by_ids_bulk(chunk_node['children'])
- chunk_node['children'] = child_nodes
+ if "children" in chunk_node:
+ child_nodes = self.get_nodes_by_ids_bulk(chunk_node["children"])
+ chunk_node["children"] = child_nodes
studio_nodes.extend(chunk_nodes)
return studio_nodes
@@ -128,13 +131,12 @@ def get_tree_for_studio_id(self, studio_id):
    Returns the full json tree (recursive calls to /api/get_nodes_by_ids_complete)
"""
channel_root = self.get_nodes_by_ids_complete(studio_id)
- if 'children' in channel_root:
- children_refs = channel_root['children']
+ if "children" in channel_root:
+ children_refs = channel_root["children"]
studio_nodes = self.get_nodes_by_ids_bulk(children_refs)
- channel_root['children'] = studio_nodes
+ channel_root["children"] = studio_nodes
return channel_root
-
def get_contentnode(self, studio_id):
"""
    Return the Studio content node data for the node with id `studio_id`.
@@ -145,9 +147,11 @@ def put_contentnode(self, data):
"""
    Send a PUT request to /api/contentnode to update a Studio node with `data`.
"""
- CONTENTNODE_ENDPOINT = self.studio_url + '/api/contentnode'
- REQUIRED_FIELDS = ['id', 'tags', 'prerequisite', 'parent']
- assert data_has_required_keys(data, REQUIRED_FIELDS), 'missing necessary attributes'
+ CONTENTNODE_ENDPOINT = self.studio_url + "/api/contentnode"
+ REQUIRED_FIELDS = ["id", "tags", "prerequisite", "parent"]
+ assert data_has_required_keys(
+ data, REQUIRED_FIELDS
+ ), "missing necessary attributes"
# studio_id = data['id']
url = CONTENTNODE_ENDPOINT
# print(' semantic PATCH using PUT ' + url)
@@ -164,16 +168,18 @@ def delete_contentnode(self, data, channel_id, trash_studio_id=None):
    can provide `trash_studio_id`, which is the studio id of the trash tree for
the channel.
"""
- MOVE_NODES_ENDPOINT = self.studio_url + '/api/move_nodes/'
- REQUIRED_FIELDS = ['id']
- assert data_has_required_keys(data, REQUIRED_FIELDS), 'missing necessary attributes'
+ MOVE_NODES_ENDPOINT = self.studio_url + "/api/move_nodes/"
+ REQUIRED_FIELDS = ["id"]
+ assert data_has_required_keys(
+ data, REQUIRED_FIELDS
+ ), "missing necessary attributes"
if trash_studio_id is None:
channel_data = self.get_channel(channel_id)
- trash_studio_id = channel_data['trash_tree']['id']
+ trash_studio_id = channel_data["trash_tree"]["id"]
post_data = {
- 'nodes': [data],
- 'target_parent': trash_studio_id,
- 'channel_id': channel_id,
+ "nodes": [data],
+ "target_parent": trash_studio_id,
+ "channel_id": channel_id,
}
url = MOVE_NODES_ENDPOINT
# print(' semantic DELETE using POST to ' + url)
@@ -188,13 +194,13 @@ def copy_contentnode(self, data, target_parent, channel_id):
    Send a POST request to /api/duplicate_nodes/ to copy node `data`
to the target parent folder `target_parent` in channel `channel_id`.
"""
- DUPLICATE_NODE_INLINE_ENDPOINT = self.studio_url + '/api/duplicate_nodes/'
- REQUIRED_FIELDS = ['id']
- assert data_has_required_keys(data, REQUIRED_FIELDS), 'no studio_id in data'
+ DUPLICATE_NODE_INLINE_ENDPOINT = self.studio_url + "/api/duplicate_nodes/"
+ REQUIRED_FIELDS = ["id"]
+ assert data_has_required_keys(data, REQUIRED_FIELDS), "no studio_id in data"
post_data = {
- 'node_ids': [data['id']],
- 'target_parent': target_parent,
- 'channel_id': channel_id,
+ "node_ids": [data["id"]],
+ "target_parent": target_parent,
+ "channel_id": channel_id,
}
url = DUPLICATE_NODE_INLINE_ENDPOINT
# print(' semantic COPY using POST to ' + url)
@@ -205,17 +211,9 @@ def copy_contentnode(self, data, target_parent, channel_id):
return copied_data_list
-
def data_has_required_keys(data, required_keys):
verdict = True
for key in required_keys:
if key not in data:
verdict = False
return verdict
-
-
-
-
-
-
-
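
# --- Illustrative usage sketch (not part of the diff) -------------------------
# Using StudioApi as refactored above; token, credentials, and the channel id
# are placeholders. Session credentials are needed because get_channel() goes
# through the session-authenticated /api/channel/ endpoint.
from ricecooker.utils.libstudio import StudioApi

api = StudioApi(
    token="your-studio-api-token",  # placeholder
    username="you@example.org",  # placeholder
    password="********",  # placeholder
    studio_url="https://studio.learningequality.org",
)
channel_id = "your32characterchannelidgoeshere"  # placeholder
root_studio_id = api.get_channel_root_studio_id(channel_id, tree="main")
channel_tree = api.get_tree_for_studio_id(root_studio_id)
print(len(channel_tree.get("children", [])), "top-level nodes")
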
diff --git a/ricecooker/utils/linecook.py b/ricecooker/utils/linecook.py
index c82b35bc..221c9b66 100644
--- a/ricecooker/utils/linecook.py
+++ b/ricecooker/utils/linecook.py
@@ -1,28 +1,46 @@
import argparse
import os
-from ricecooker.config import LOGGER
from le_utils.constants import content_kinds
-from .metadata_provider import path_to_tuple
-from .jsontrees import (TOPIC_NODE, VIDEO_NODE, AUDIO_NODE, EXERCISE_NODE,
- DOCUMENT_NODE, HTML5_NODE)
-from .jsontrees import (VIDEO_FILE, AUDIO_FILE, DOCUMENT_FILE, EPUB_FILE, HTML5_FILE,
- THUMBNAIL_FILE, SUBTITLES_FILE)
+
+from .jsontrees import AUDIO_FILE
+from .jsontrees import AUDIO_NODE
+from .jsontrees import DOCUMENT_FILE
+from .jsontrees import DOCUMENT_NODE
+from .jsontrees import EPUB_FILE
+from .jsontrees import EXERCISE_NODE
+from .jsontrees import HTML5_FILE
+from .jsontrees import HTML5_NODE
+from .jsontrees import SUBTITLES_FILE
+from .jsontrees import THUMBNAIL_FILE
+from .jsontrees import TOPIC_NODE
+from .jsontrees import VIDEO_FILE
+from .jsontrees import VIDEO_NODE
from .jsontrees import write_tree_to_json_tree
+from .metadata_provider import path_to_tuple
+from ricecooker.config import LOGGER
# LINECOOK CONFIGS
################################################################################
DIR_EXCLUDE_PATTERNS = []
-FILE_EXCLUDE_EXTENTIONS = ['.DS_Store', 'Thumbs.db', 'ehthumbs.db', 'ehthumbs_vista.db', '.gitkeep']
+FILE_EXCLUDE_EXTENTIONS = [
+ ".DS_Store",
+ "Thumbs.db",
+ "ehthumbs.db",
+ "ehthumbs_vista.db",
+ ".gitkeep",
+]
FILE_SKIP_PATTENRS = []
-FILE_SKIP_THUMBNAILS = [] # global list of paths that correspond to thumbails for other content nodes
-
+FILE_SKIP_THUMBNAILS = (
+ []
+)  # global list of paths that correspond to thumbnails for other content nodes
# LINECOOK HELPER FUNCTIONS
################################################################################
+
def chan_path_from_rel_path(rel_path, channeldir):
"""
Convert `rel_path` form os.walk tuple format to a tuple of directories and
@@ -36,13 +54,14 @@ def chan_path_from_rel_path(rel_path, channeldir):
dirs_before_channeldir = channeldir.split(os.path.sep)[:-1]
channel_chan_path = [] # path relative to channel root, inclusive
for idx, part in enumerate(rel_path_parts):
- if idx < len(dirs_before_channeldir) and dirs_before_channeldir[idx]==part:
+ if idx < len(dirs_before_channeldir) and dirs_before_channeldir[idx] == part:
continue
else:
channel_chan_path.append(part)
chan_path = os.path.join(*channel_chan_path)
return chan_path
+
def rel_path_from_chan_path(chan_path, channeldir, windows=False):
"""
Convert `chan_path` as obtained from a metadata provider into a `rel_path`
@@ -51,35 +70,41 @@ def rel_path_from_chan_path(chan_path, channeldir, windows=False):
'content/open_stax_zip/Open Stax/Math'
"""
if windows:
- chan_path_list = chan_path.split('\\')
+ chan_path_list = chan_path.split("\\")
else:
- chan_path_list = chan_path.split('/')
+ chan_path_list = chan_path.split("/")
chan_path_list.pop(0) # remove the channel root dir
rel_path = os.path.join(channeldir, *chan_path_list)
return rel_path
+
def get_topic_for_path(channel, chan_path_tuple):
"""
    Given a channel (dict) that contains a hierarchy of TopicNode dicts, walk the
    path given in `chan_path_tuple` to find the corresponding TopicNode.
"""
- assert chan_path_tuple[0] == channel['dirname'], 'Wrong channeldir'
+ assert chan_path_tuple[0] == channel["dirname"], "Wrong channeldir"
chan_path_list = list(chan_path_tuple)
- chan_path_list.pop(0) # skip the channel name
+ chan_path_list.pop(0) # skip the channel name
if len(chan_path_list) == 0:
return channel
current = channel
for subtopic in chan_path_list:
- current = list(filter(lambda d: 'dirname' in d and d['dirname'] == subtopic, current['children']))[0]
+ current = list(
+ filter(
+ lambda d: "dirname" in d and d["dirname"] == subtopic,
+ current["children"],
+ )
+ )[0]
return current
-
# LINECOOK BUILD JSON TREE
################################################################################
+
def filter_filenames(filenames):
"""
Skip files with extentions in `FILE_EXCLUDE_EXTENTIONS` and filenames that
@@ -91,19 +116,27 @@ def filter_filenames(filenames):
for pattern in FILE_EXCLUDE_EXTENTIONS:
if filename.endswith(pattern):
keep = False
- for pattern in FILE_SKIP_PATTENRS: # This will reject exercises...
+ for pattern in FILE_SKIP_PATTENRS: # This will reject exercises...
if pattern in filename:
keep = False
if keep:
filenames_cleaned.append(filename)
return filenames_cleaned
+
def filter_thumbnail_files(chan_path, filenames, metadata_provider):
"""
We don't want to create `ContentNode` from thumbnail files.
"""
- thumbnail_files_to_skip = set(os.path.join(*p) for p in metadata_provider.get_thumbnail_paths())
- return [filename for filename in filenames if os.path.join(chan_path, filename) not in thumbnail_files_to_skip]
+ thumbnail_files_to_skip = set(
+ os.path.join(*p) for p in metadata_provider.get_thumbnail_paths()
+ )
+ return [
+ filename
+ for filename in filenames
+ if os.path.join(chan_path, filename) not in thumbnail_files_to_skip
+ ]
+
def keep_folder(raw_path):
"""
@@ -112,78 +145,93 @@ def keep_folder(raw_path):
keep = True
for pattern in DIR_EXCLUDE_PATTERNS:
if pattern in raw_path:
- LOGGER.debug('rejecting', raw_path)
+            LOGGER.debug("rejecting %s", raw_path)
keep = False
return keep
+
def process_folder(channel, rel_path, filenames, metadata_provider):
"""
    Create `ContentNode`s from each file in this folder and attach them to `channel`
under the path `rel_path`.
"""
- LOGGER.debug('IN process_folder ' + str(rel_path) + ' ' + str(filenames))
+ LOGGER.debug("IN process_folder " + str(rel_path) + " " + str(filenames))
if not keep_folder(rel_path):
return
chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
chan_path_tuple = path_to_tuple(chan_path)
chan_path_list = list(chan_path_tuple)
- LOGGER.debug('chan_path_list=' + str(chan_path_list))
+ LOGGER.debug("chan_path_list=" + str(chan_path_list))
# FIND THE CONTAINING NODE (channel or topic)
if len(chan_path_list) == 1:
# CASE CHANNEL ROOT: `rel_path` points to `channeldir`
# No need to create a topic node here since channel already exists
- containing_node = channel # attach content nodes in filenames directly to channel
+ containing_node = (
+ channel # attach content nodes in filenames directly to channel
+ )
else:
# CASE TOPIC FOLDER: `rel_path` points to a channelroot subfolder (a.k.a TopicNode)
- dirname = chan_path_list.pop() # name of the folder (used as ID for internal lookup)
+ dirname = (
+ chan_path_list.pop()
+ ) # name of the folder (used as ID for internal lookup)
topic_parent_node = get_topic_for_path(channel, chan_path_list)
# read topic metadata to get title and description for the TopicNode
topic_metadata = metadata_provider.get(chan_path_tuple)
- thumbnail_chan_path = topic_metadata.get('thumbnail_chan_path', None)
+ thumbnail_chan_path = topic_metadata.get("thumbnail_chan_path", None)
if thumbnail_chan_path:
- thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir)
+ thumbnail_rel_path = rel_path_from_chan_path(
+ thumbnail_chan_path, metadata_provider.channeldir
+ )
else:
thumbnail_rel_path = None
# create TopicNode for this folder
topic = dict(
kind=TOPIC_NODE,
dirname=dirname,
- source_id='sourceid:' + rel_path,
- title=topic_metadata.get('title', dirname),
- description=topic_metadata.get('description', None),
- author=topic_metadata.get('author', None),
- language=topic_metadata.get('language', None),
- license=topic_metadata.get('license', None),
+ source_id="sourceid:" + rel_path,
+ title=topic_metadata.get("title", dirname),
+ description=topic_metadata.get("description", None),
+ author=topic_metadata.get("author", None),
+ language=topic_metadata.get("language", None),
+ license=topic_metadata.get("license", None),
thumbnail=thumbnail_rel_path,
children=[],
)
- topic_parent_node['children'].append(topic)
- containing_node = topic # attach content nodes in filenames to the newly created topic
+ topic_parent_node["children"].append(topic)
+ containing_node = (
+ topic # attach content nodes in filenames to the newly created topic
+ )
# filter filenames
filenames_cleaned = filter_filenames(filenames)
- filenames_cleaned2 = filter_thumbnail_files(chan_path, filenames_cleaned, metadata_provider)
+ filenames_cleaned2 = filter_thumbnail_files(
+ chan_path, filenames_cleaned, metadata_provider
+ )
# PROCESS FILES
for filename in filenames_cleaned2:
chan_filepath = os.path.join(chan_path, filename)
chan_filepath_tuple = path_to_tuple(chan_filepath)
metadata = metadata_provider.get(chan_filepath_tuple)
- node = make_content_node(metadata_provider.channeldir, rel_path, filename, metadata)
- containing_node['children'].append(node) # attach content node to containing_node
+ node = make_content_node(
+ metadata_provider.channeldir, rel_path, filename, metadata
+ )
+ containing_node["children"].append(
+ node
+ ) # attach content node to containing_node
def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path):
"""
    Build a ricecooker json tree from the files in the channel directory using
    the metadata in `metadata_provider`, and write the result to `json_tree_path`.
"""
- LOGGER.info('Starting to build the ricecooker_json_tree')
+ LOGGER.info("Starting to build the ricecooker_json_tree")
- channeldir = args['channeldir']
+ channeldir = args["channeldir"]
if channeldir.endswith(os.path.sep):
channeldir.rstrip(os.path.sep)
channelparentdir, channeldirname = os.path.split(channeldir)
@@ -191,23 +239,25 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path)
# Ricecooker tree
channel_info = metadata_provider.get_channel_info()
- thumbnail_chan_path = channel_info.get('thumbnail_chan_path', None)
+ thumbnail_chan_path = channel_info.get("thumbnail_chan_path", None)
if thumbnail_chan_path:
- thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, metadata_provider.channeldir)
+ thumbnail_rel_path = rel_path_from_chan_path(
+ thumbnail_chan_path, metadata_provider.channeldir
+ )
else:
thumbnail_rel_path = None
ricecooker_json_tree = dict(
dirname=channeldirname,
- title=channel_info['title'],
- description=channel_info['description'],
- source_domain=channel_info['source_domain'],
- source_id=channel_info['source_id'],
- language=channel_info['language'],
+ title=channel_info["title"],
+ description=channel_info["description"],
+ source_domain=channel_info["source_domain"],
+ source_id=channel_info["source_id"],
+ language=channel_info["language"],
thumbnail=thumbnail_rel_path,
children=[],
)
- channeldir = args['channeldir']
+ channeldir = args["channeldir"]
content_folders = sorted(os.walk(channeldir))
# MAIN PROCESSING OF os.walk OUTPUT
@@ -215,7 +265,7 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path)
# TODO(ivan): figure out all the implications of the
# _ = content_folders.pop(0) # Skip over channel folder because handled above
for rel_path, _subfolders, filenames in content_folders:
- LOGGER.info('processing folder ' + str(rel_path))
+ LOGGER.info("processing folder " + str(rel_path))
# IMPLEMENTATION DETAIL:
# - `filenames` contains real files in the `channeldir` folder
@@ -223,17 +273,23 @@ def build_ricecooker_json_tree(args, options, metadata_provider, json_tree_path)
# order of nodes within a given topic. Since alphabetical order is used to
# walk the files in the `channeldir`, we must "splice in" the exercises here
if metadata_provider.has_exercises():
- dir_chan_path = chan_path_from_rel_path(rel_path, metadata_provider.channeldir)
+ dir_chan_path = chan_path_from_rel_path(
+ rel_path, metadata_provider.channeldir
+ )
dir_path_tuple = path_to_tuple(dir_chan_path)
- exercises_filenames = metadata_provider.get_exercises_for_dir(dir_path_tuple)
+ exercises_filenames = metadata_provider.get_exercises_for_dir(
+ dir_path_tuple
+ )
filenames.extend(exercises_filenames)
sorted_filenames = sorted(filenames)
- process_folder(ricecooker_json_tree, rel_path, sorted_filenames, metadata_provider)
+ process_folder(
+ ricecooker_json_tree, rel_path, sorted_filenames, metadata_provider
+ )
# Write out ricecooker_json_tree.json
write_tree_to_json_tree(json_tree_path, ricecooker_json_tree)
- LOGGER.info('Folder hierarchy walk result stored in ' + json_tree_path)
+ LOGGER.info("Folder hierarchy walk result stored in " + json_tree_path)
def make_content_node(channeldir, rel_path, filename, metadata):
@@ -244,24 +300,28 @@ def make_content_node(channeldir, rel_path, filename, metadata):
ext = file_ext[1:]
kind = None
if ext in content_kinds.MAPPING:
- kind = content_kinds.MAPPING[ext] # guess what kind based on file extension
- elif 'questions' in metadata:
+ kind = content_kinds.MAPPING[ext] # guess what kind based on file extension
+ elif "questions" in metadata:
kind = content_kinds.EXERCISE
else:
- raise ValueError('Could not find kind for extension ' + str(ext) + ' in content_kinds.MAPPING')
+ raise ValueError(
+ "Could not find kind for extension "
+ + str(ext)
+ + " in content_kinds.MAPPING"
+ )
# Extract metadata fields
- source_id = metadata.get('source_id', None)
+ source_id = metadata.get("source_id", None)
if source_id is None:
- source_id = metadata['chan_path']
+ source_id = metadata["chan_path"]
filepath = os.path.join(rel_path, filename)
- title = metadata['title']
- description = metadata.get('description', None)
- author = metadata.get('author', None)
- lang = metadata.get('language', None)
- license_dict = metadata.get('license', None)
- thumbnail_chan_path = metadata.get('thumbnail_chan_path', None)
+ title = metadata["title"]
+ description = metadata.get("description", None)
+ author = metadata.get("author", None)
+ lang = metadata.get("language", None)
+ license_dict = metadata.get("license", None)
+ thumbnail_chan_path = metadata.get("thumbnail_chan_path", None)
if thumbnail_chan_path:
thumbnail_rel_path = rel_path_from_chan_path(thumbnail_chan_path, channeldir)
else:
@@ -278,7 +338,9 @@ def make_content_node(channeldir, rel_path, filename, metadata):
license=license_dict,
derive_thumbnail=True,
thumbnail=thumbnail_rel_path,
- files=[{'file_type':VIDEO_FILE, 'path':filepath, 'language':lang}], # ffmpeg_settings={"crf": 24},
+ files=[
+ {"file_type": VIDEO_FILE, "path": filepath, "language": lang}
+ ], # ffmpeg_settings={"crf": 24},
)
elif kind == AUDIO_NODE:
@@ -292,7 +354,7 @@ def make_content_node(channeldir, rel_path, filename, metadata):
license=license_dict,
thumbnail=thumbnail_rel_path,
derive_thumbnail=True,
- files=[{'file_type':AUDIO_FILE, 'path':filepath, 'language':lang}],
+ files=[{"file_type": AUDIO_FILE, "path": filepath, "language": lang}],
)
elif kind == DOCUMENT_NODE:
@@ -306,24 +368,16 @@ def make_content_node(channeldir, rel_path, filename, metadata):
license=license_dict,
thumbnail=thumbnail_rel_path,
derive_thumbnail=True,
- files=[]
+ files=[],
)
- if ext == 'pdf':
- pdf_file = {
- 'file_type':DOCUMENT_FILE,
- 'path':filepath,
- 'language':lang
- }
- content_node['files'].append(pdf_file)
- elif ext == 'epub':
- epub_file = {
- 'file_type':EPUB_FILE,
- 'path':filepath,
- 'language':lang
- }
- content_node['files'].append(epub_file)
+ if ext == "pdf":
+ pdf_file = {"file_type": DOCUMENT_FILE, "path": filepath, "language": lang}
+ content_node["files"].append(pdf_file)
+ elif ext == "epub":
+ epub_file = {"file_type": EPUB_FILE, "path": filepath, "language": lang}
+ content_node["files"].append(epub_file)
else:
- raise ValueError('Ext {} not supported for kind {}'.format(ext, kind))
+ raise ValueError("Ext {} not supported for kind {}".format(ext, kind))
elif kind == HTML5_NODE:
content_node = dict(
@@ -336,7 +390,7 @@ def make_content_node(channeldir, rel_path, filename, metadata):
license=license_dict,
thumbnail=thumbnail_rel_path,
derive_thumbnail=True,
- files=[{'file_type':HTML5_FILE, 'path':filepath, 'language':lang}],
+ files=[{"file_type": HTML5_FILE, "path": filepath, "language": lang}],
)
elif kind == EXERCISE_NODE:
@@ -348,34 +402,36 @@ def make_content_node(channeldir, rel_path, filename, metadata):
description=description,
language=lang,
license=license_dict,
- exercise_data=metadata['exercise_data'],
- questions=metadata['questions'],
+ exercise_data=metadata["exercise_data"],
+ questions=metadata["questions"],
thumbnail=thumbnail_rel_path,
derive_thumbnail=False,
files=[],
)
else:
- raise ValueError('Not implemented case for kind ' + str(kind))
+ raise ValueError("Not implemented case for kind " + str(kind))
return content_node
-
# AUTOMATIC REMOVAL OF TRAILING SLASHES FOR channeldir
################################################################################
+
class NonFolderError(Exception):
pass
+
class FolderExistsAction(argparse.Action):
"""
Custom argparse action: verify the argument to be a folder (directory).
The action will strip off trailing slashes from the folder's name.
"""
+
def verify_folder_existence(self, folder_name):
if not os.path.isdir(folder_name):
- message = 'ERROR: {0} is not a folder'.format(folder_name)
+ message = "ERROR: {0} is not a folder".format(folder_name)
raise NonFolderError(message)
folder_name = folder_name.rstrip(os.sep)
return folder_name
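
# --- Illustrative usage sketch (not part of the diff) -------------------------
# Round-trip of the path helpers changed above (POSIX paths assumed). The folder
# names are placeholders; channeldir is the local folder that contains the
# channel root dir.
from ricecooker.utils.linecook import chan_path_from_rel_path, rel_path_from_chan_path

channeldir = "content/open_stax_zip/Open Stax"
rel_path = "content/open_stax_zip/Open Stax/Math"

chan_path = chan_path_from_rel_path(rel_path, channeldir)
# chan_path == 'Open Stax/Math'  (relative to, and including, the channel root dir)
assert rel_path_from_chan_path(chan_path, channeldir) == rel_path
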
diff --git a/ricecooker/utils/metadata_provider.py b/ricecooker/utils/metadata_provider.py
index 179c6c61..ba34de2e 100644
--- a/ricecooker/utils/metadata_provider.py
+++ b/ricecooker/utils/metadata_provider.py
@@ -1,51 +1,52 @@
-from collections import defaultdict
import csv
import json
import os
import re
-import requests
+from collections import defaultdict
from unicodedata import normalize
-from le_utils.constants import content_kinds, exercises
-from ricecooker.config import LOGGER
-from ricecooker.utils.libstudio import StudioApi
+import requests
+from le_utils.constants import content_kinds
+from le_utils.constants import exercises
from ricecooker.classes.questions import MARKDOWN_IMAGE_REGEX
+from ricecooker.config import LOGGER
+from ricecooker.utils.libstudio import StudioApi
# CONSTANTS
################################################################################
-DEFAULT_EXTRA_ITEMS_SEPARATOR = '🍣' # used to separate list-like data in CSV
-CSV_STR_TRUE_VALUES = ['on', 'yes', '1', 'true']
-CSV_STR_FALSE_VALUES = ['off', 'no', '0', 'false']
-
-DEFAULT_CHANNEL_INFO_FILENAME = 'Channel.csv'
-CHANNEL_TITLE_KEY = 'Title'
-CHANNEL_DESCRIPTION_KEY = 'Description'
-CHANNEL_DOMAIN_KEY = 'Domain'
-CHANNEL_SOURCEID_KEY = 'Source ID'
-CHANNEL_LANGUAGE_KEY = 'Language'
-CHANNEL_THUMBNAIL_KEY = 'Thumbnail'
+DEFAULT_EXTRA_ITEMS_SEPARATOR = "🍣" # used to separate list-like data in CSV
+CSV_STR_TRUE_VALUES = ["on", "yes", "1", "true"]
+CSV_STR_FALSE_VALUES = ["off", "no", "0", "false"]
+
+DEFAULT_CHANNEL_INFO_FILENAME = "Channel.csv"
+CHANNEL_TITLE_KEY = "Title"
+CHANNEL_DESCRIPTION_KEY = "Description"
+CHANNEL_DOMAIN_KEY = "Domain"
+CHANNEL_SOURCEID_KEY = "Source ID"
+CHANNEL_LANGUAGE_KEY = "Language"
+CHANNEL_THUMBNAIL_KEY = "Thumbnail"
CHANNEL_INFO_HEADER = [
CHANNEL_TITLE_KEY,
CHANNEL_DESCRIPTION_KEY,
CHANNEL_DOMAIN_KEY,
CHANNEL_SOURCEID_KEY,
CHANNEL_LANGUAGE_KEY,
- CHANNEL_THUMBNAIL_KEY
+ CHANNEL_THUMBNAIL_KEY,
]
-DEFAULT_CONTENT_INFO_FILENAME = 'Content.csv'
-CONTENT_PATH_KEY = 'Path *'
-CONTENT_TITLE_KEY = 'Title *'
-CONTENT_SOURCEID_KEY = 'Source ID'
-CONTENT_DESCRIPTION_KEY = 'Description'
-CONTENT_AUTHOR_KEY = 'Author'
-CONTENT_LANGUAGE_KEY = 'Language'
-CONTENT_LICENSE_ID_KEY = 'License ID *'
-CONTENT_LICENSE_DESCRIPTION_KEY = 'License Description'
-CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY = 'Copyright Holder'
-CONTENT_THUMBNAIL_KEY = 'Thumbnail'
+DEFAULT_CONTENT_INFO_FILENAME = "Content.csv"
+CONTENT_PATH_KEY = "Path *"
+CONTENT_TITLE_KEY = "Title *"
+CONTENT_SOURCEID_KEY = "Source ID"
+CONTENT_DESCRIPTION_KEY = "Description"
+CONTENT_AUTHOR_KEY = "Author"
+CONTENT_LANGUAGE_KEY = "Language"
+CONTENT_LICENSE_ID_KEY = "License ID *"
+CONTENT_LICENSE_DESCRIPTION_KEY = "License Description"
+CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY = "Copyright Holder"
+CONTENT_THUMBNAIL_KEY = "Thumbnail"
CONTENT_INFO_HEADER = [
CONTENT_PATH_KEY,
CONTENT_TITLE_KEY,
@@ -56,14 +57,14 @@
CONTENT_LICENSE_ID_KEY,
CONTENT_LICENSE_DESCRIPTION_KEY,
CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY,
- CONTENT_THUMBNAIL_KEY
+ CONTENT_THUMBNAIL_KEY,
]
-DEFAULT_EXERCISES_INFO_FILENAME = 'Exercises.csv'
-EXERCISE_SOURCEID_KEY = 'Source ID *'
-EXERCISE_M_KEY = 'Number Correct' # (integer)
-EXERCISE_N_KEY = 'Out of Total' # (integer)
-EXERCISE_RANDOMIZE_KEY = 'Randomize' # Use 'true' (default) or 'false'
+DEFAULT_EXERCISES_INFO_FILENAME = "Exercises.csv"
+EXERCISE_SOURCEID_KEY = "Source ID *"
+EXERCISE_M_KEY = "Number Correct" # (integer)
+EXERCISE_N_KEY = "Out of Total" # (integer)
+EXERCISE_RANDOMIZE_KEY = "Randomize" # Use 'true' (default) or 'false'
EXERCISE_INFO_HEADER = [
CONTENT_PATH_KEY,
CONTENT_TITLE_KEY,
@@ -77,30 +78,38 @@
EXERCISE_M_KEY,
EXERCISE_N_KEY,
EXERCISE_RANDOMIZE_KEY,
- CONTENT_THUMBNAIL_KEY
+ CONTENT_THUMBNAIL_KEY,
]
-DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME = 'ExerciseQuestions.csv'
-EXERCISE_QUESTIONS_QUESTIONID_KEY = 'Question ID *' # unique idendifier for this question
-EXERCISE_QUESTIONS_TYPE_KEY = 'Question type *' # one of ['SingleSelectQuestion', 'MultipleSelectQuestion', 'InputQuestion']
-EXERCISE_QUESTIONS_QUESTION_KEY = 'Question *' # string that contains the question setup and the prompt
-EXERCISE_QUESTIONS_OPTION_A_KEY = 'Option A'
-EXERCISE_QUESTIONS_OPTION_B_KEY = 'Option B'
-EXERCISE_QUESTIONS_OPTION_C_KEY = 'Option C'
-EXERCISE_QUESTIONS_OPTION_D_KEY = 'Option D'
-EXERCISE_QUESTIONS_OPTION_E_KEY = 'Option E'
-EXERCISE_QUESTIONS_OPTION_FGHI_KEY = 'Options F...' # This field can contain a list of multiple '🍣'-separated string values,
- # e.g., 'Anser F🍣Answer G🍣Answer H' (or other suitable unicode character)
-EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY = 'Correct Answer *' # A string that equals one of the options strings
-EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY = 'Correct Answer 2' # (for multiple select)
-EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY = 'Correct Answer 3' # (for multiple select)
-EXERCISE_QUESTIONS_HINT_1_KEY = 'Hint 1'
-EXERCISE_QUESTIONS_HINT_2_KEY = 'Hint 2'
-EXERCISE_QUESTIONS_HINT_3_KEY = 'Hint 3'
-EXERCISE_QUESTIONS_HINT_4_KEY = 'Hint 4'
-EXERCISE_QUESTIONS_HINT_5_KEY = 'Hint 5'
-EXERCISE_QUESTIONS_HINT_6789_KEY = 'Hint 6+' # This field can contain a list of multiple '🍣'-separated string values,
- # e.g., 'Hint 6 text🍣Hint 7 text🍣Hing 8 text'
+DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME = "ExerciseQuestions.csv"
+EXERCISE_QUESTIONS_QUESTIONID_KEY = (
+ "Question ID *" # unique idendifier for this question
+)
+EXERCISE_QUESTIONS_TYPE_KEY = "Question type *" # one of ['SingleSelectQuestion', 'MultipleSelectQuestion', 'InputQuestion']
+EXERCISE_QUESTIONS_QUESTION_KEY = (
+ "Question *" # string that contains the question setup and the prompt
+)
+EXERCISE_QUESTIONS_OPTION_A_KEY = "Option A"
+EXERCISE_QUESTIONS_OPTION_B_KEY = "Option B"
+EXERCISE_QUESTIONS_OPTION_C_KEY = "Option C"
+EXERCISE_QUESTIONS_OPTION_D_KEY = "Option D"
+EXERCISE_QUESTIONS_OPTION_E_KEY = "Option E"
+EXERCISE_QUESTIONS_OPTION_FGHI_KEY = "Options F..." # This field can contain a list of multiple '🍣'-separated string values,
+# e.g., 'Answer F🍣Answer G🍣Answer H' (or other suitable unicode character)
+EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY = (
+ "Correct Answer *" # A string that equals one of the options strings
+)
+EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY = "Correct Answer 2" # (for multiple select)
+EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY = "Correct Answer 3" # (for multiple select)
+EXERCISE_QUESTIONS_HINT_1_KEY = "Hint 1"
+EXERCISE_QUESTIONS_HINT_2_KEY = "Hint 2"
+EXERCISE_QUESTIONS_HINT_3_KEY = "Hint 3"
+EXERCISE_QUESTIONS_HINT_4_KEY = "Hint 4"
+EXERCISE_QUESTIONS_HINT_5_KEY = "Hint 5"
+EXERCISE_QUESTIONS_HINT_6789_KEY = (
+ "Hint 6+" # This field can contain a list of multiple '🍣'-separated string values,
+)
+# e.g., 'Hint 6 text🍣Hint 7 text🍣Hint 8 text'
EXERCISE_QUESTIONS_INFO_HEADER = [
EXERCISE_SOURCEID_KEY,
EXERCISE_QUESTIONS_QUESTIONID_KEY,
@@ -120,13 +129,14 @@
EXERCISE_QUESTIONS_HINT_3_KEY,
EXERCISE_QUESTIONS_HINT_4_KEY,
EXERCISE_QUESTIONS_HINT_5_KEY,
- EXERCISE_QUESTIONS_HINT_6789_KEY
+ EXERCISE_QUESTIONS_HINT_6789_KEY,
]
# HELPER FUNCTIONS
################################################################################
+
def path_to_tuple(path):
"""
Split a current file system path into individual parts and form a tuple for key lookups.
@@ -146,7 +156,7 @@ def path_to_tuple(path):
# Normalize UTF-8 encoding to consistent form so cache lookups will work, see
# https://docs.python.org/3.6/library/unicodedata.html#unicodedata.normalize
- path_tup = tuple(normalize('NFD', part) for part in allparts)
+ path_tup = tuple(normalize("NFD", part) for part in allparts)
return path_tup
@@ -155,15 +165,16 @@ def input_path_to_tuple(path, windows=False):
Split `chan_path` into individual parts and form a tuple (used as key).
"""
if windows:
- path_tup = tuple(path.split('\\'))
+ path_tup = tuple(path.split("\\"))
else:
- path_tup = tuple(path.split('/'))
+ path_tup = tuple(path.split("/"))
#
# Normalize UTF-8 encoding to consistent form so cache lookups will work, see
# https://docs.python.org/3.6/library/unicodedata.html#unicodedata.normalize
- path_tup = tuple(normalize('NFD', part) for part in path_tup)
+ path_tup = tuple(normalize("NFD", part) for part in path_tup)
return path_tup
+
def get_metadata_file_path(channeldir, filename):
"""
Return the path to the metadata file named `filename` that is a sibling of `channeldir`.
@@ -172,10 +183,10 @@ def get_metadata_file_path(channeldir, filename):
return os.path.join(channelparentdir, filename)
-
# METADATA PROVIDER BASE CLASS
################################################################################
+
class MetadataProvider(object):
def validate(self):
"""Check if metadata provided is valid."""
@@ -183,13 +194,16 @@ def validate(self):
class CsvMetadataProvider(MetadataProvider):
-
- def __init__(self, channeldir,
- channelinfo=DEFAULT_CHANNEL_INFO_FILENAME,
- contentinfo=DEFAULT_CONTENT_INFO_FILENAME,
- exercisesinfo=DEFAULT_EXERCISES_INFO_FILENAME,
- questionsinfo=DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME,
- winpaths=False, validate_and_cache=True):
+ def __init__(
+ self,
+ channeldir,
+ channelinfo=DEFAULT_CHANNEL_INFO_FILENAME,
+ contentinfo=DEFAULT_CONTENT_INFO_FILENAME,
+ exercisesinfo=DEFAULT_EXERCISES_INFO_FILENAME,
+ questionsinfo=DEFAULT_EXERCISE_QUESTIONS_INFO_FILENAME,
+ winpaths=False,
+ validate_and_cache=True,
+ ):
"""
Load the metadata from CSV files `channelinfo`, `contentinfo`, and optionally
        exercise data from `exercisesinfo` and `questionsinfo` files.
@@ -203,14 +217,16 @@ def __init__(self, channeldir,
self.contentinfo = contentinfo
self.exercisesinfo = exercisesinfo
self.questionsinfo = questionsinfo
- self.contentcache = {} # { ('chan', 'path','as','tuple's) --> node metadata dict
- self.exercise_filenames_in_dir = defaultdict(list) # { ('chan', 'path','some','dir) --> list of exercises (virtual filenames)
+ self.contentcache = (
+ {}
+ ) # { ('chan', 'path','as','tuple's) --> node metadata dict
+ self.exercise_filenames_in_dir = defaultdict(
+ list
+ ) # { ('chan', 'path','some','dir) --> list of exercises (virtual filenames)
self.winpaths = winpaths # paths separator in .csv is windows '\'
if validate_and_cache:
self.validate_headers()
- self.cache_contentinfo() # read and parse CSV to build cache lookup table
-
-
+ self.cache_contentinfo() # read and parse CSV to build cache lookup table
# MAIN METHODS
############################################################################
@@ -227,7 +243,9 @@ def cache_contentinfo(self):
dict_reader = csv.DictReader(csv_lines)
for row in dict_reader:
row_dict = self._map_content_row_to_dict(row)
- path_tuple = input_path_to_tuple(row_dict['chan_path'], windows=self.winpaths)
+ path_tuple = input_path_to_tuple(
+ row_dict["chan_path"], windows=self.winpaths
+ )
self.contentcache[path_tuple] = row_dict
        # Additional handling of data in Exercises.csv and ExerciseQuestions.csv
@@ -239,8 +257,8 @@ def cache_contentinfo(self):
dict_reader = csv.DictReader(csv_lines)
for question_row in dict_reader:
question_dict = self._map_exercise_question_row_to_dict(question_row)
- question_source_id = question_dict['source_id']
- del question_dict['source_id']
+ question_source_id = question_dict["source_id"]
+ del question_dict["source_id"]
questions_by_source_id[question_source_id].append(question_dict)
# B. Load exercises
@@ -249,9 +267,11 @@ def cache_contentinfo(self):
dict_reader = csv.DictReader(csv_lines)
for exercise_row in dict_reader:
exercise_dict = self._map_exercise_row_to_dict(exercise_row)
- path_tuple = input_path_to_tuple(exercise_dict['chan_path'], windows=self.winpaths)
- question_source_id = exercise_dict['source_id']
- exercise_dict['questions'] = questions_by_source_id[question_source_id]
+ path_tuple = input_path_to_tuple(
+ exercise_dict["chan_path"], windows=self.winpaths
+ )
+ question_source_id = exercise_dict["source_id"]
+ exercise_dict["questions"] = questions_by_source_id[question_source_id]
# B1: exercises are standard content nodes, so add to contentcache
self.contentcache[path_tuple] = exercise_dict
            # B2: add exercise to list of virtual filenames for current folder
@@ -268,10 +288,10 @@ def get(self, path_tuple):
else:
# TODO: make chef robust to missing metadata
# LOGGER.error(
- LOGGER.warning('No metadata found for path_tuple ' + str(path_tuple))
+ LOGGER.warning("No metadata found for path_tuple " + str(path_tuple))
metadata = dict(
filepath=os.path.sep.join(path_tuple),
- title=os.path.sep.join(path_tuple)
+ title=os.path.sep.join(path_tuple),
)
return metadata
@@ -279,13 +299,15 @@ def get_channel_info(self):
"""
Returns the first data row from Channel.csv
"""
- csv_filename = get_metadata_file_path(channeldir=self.channeldir, filename=self.channelinfo)
+ csv_filename = get_metadata_file_path(
+ channeldir=self.channeldir, filename=self.channelinfo
+ )
csv_lines = _read_csv_lines(csv_filename)
dict_reader = csv.DictReader(csv_lines)
- channel_csvs_list = list(dict_reader)
+ channel_csvs_list = list(dict_reader)
channel_csv = channel_csvs_list[0]
if len(channel_csvs_list) > 1:
- raise ValueError('Found multiple channel rows in ' + self.channelinfo)
+ raise ValueError("Found multiple channel rows in " + self.channelinfo)
channel_cleaned = _clean_dict(channel_csv)
channel_info = self._map_channel_row_to_dict(channel_cleaned)
return channel_info
@@ -297,20 +319,22 @@ def get_thumbnail_paths(self):
thumbnail_path_tuples = []
# channel thumbnail
channel_info = self.get_channel_info()
- chthumbnail_path = channel_info.get('thumbnail_chan_path', None)
+ chthumbnail_path = channel_info.get("thumbnail_chan_path", None)
if chthumbnail_path:
- chthumbnail_path_tuple = input_path_to_tuple(chthumbnail_path, windows=self.winpaths)
+ chthumbnail_path_tuple = input_path_to_tuple(
+ chthumbnail_path, windows=self.winpaths
+ )
thumbnail_path_tuples.append(chthumbnail_path_tuple)
# content thumbnails
for content_file_path_tuple, row in self.contentcache.items():
- thumbnail_path = row.get('thumbnail_chan_path', None)
+ thumbnail_path = row.get("thumbnail_chan_path", None)
if thumbnail_path:
- thumbnail_path_tuple = input_path_to_tuple(thumbnail_path, windows=self.winpaths)
+ thumbnail_path_tuple = input_path_to_tuple(
+ thumbnail_path, windows=self.winpaths
+ )
thumbnail_path_tuples.append(thumbnail_path_tuple)
return thumbnail_path_tuples
-
-
# CHANNEL+CONTENT PARSING METHODS
############################################################################
@@ -326,7 +350,7 @@ def _map_channel_row_to_dict(self, row):
source_domain=channel_cleaned[CHANNEL_DOMAIN_KEY],
source_id=channel_cleaned[CHANNEL_SOURCEID_KEY],
language=channel_cleaned[CHANNEL_LANGUAGE_KEY],
- thumbnail_chan_path=channel_cleaned[CHANNEL_THUMBNAIL_KEY]
+ thumbnail_chan_path=channel_cleaned[CHANNEL_THUMBNAIL_KEY],
)
return channel_dict
@@ -341,7 +365,9 @@ def _map_content_row_to_dict(self, row):
license_dict = dict(
license_id=row_cleaned[CONTENT_LICENSE_ID_KEY],
description=row_cleaned.get(CONTENT_LICENSE_DESCRIPTION_KEY, None),
- copyright_holder=row_cleaned.get(CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None)
+ copyright_holder=row_cleaned.get(
+ CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None
+ ),
)
else:
license_dict = None
@@ -354,12 +380,10 @@ def _map_content_row_to_dict(self, row):
author=row_cleaned.get(CONTENT_AUTHOR_KEY, None),
language=row_cleaned.get(CONTENT_LANGUAGE_KEY, None),
license=license_dict,
- thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None)
+ thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None),
)
return row_dict
-
-
# EXERCISES CSV PARSING METHODS
############################################################################
@@ -378,7 +402,6 @@ def get_exercises_for_dir(self, dir_path_tuple):
"""
return self.exercise_filenames_in_dir[dir_path_tuple]
-
def _map_exercise_row_to_dict(self, row):
"""
Convert dictionary keys from raw CSV Exercise format to ricecooker keys.
@@ -389,7 +412,9 @@ def _map_exercise_row_to_dict(self, row):
license_dict = dict(
license_id=row_cleaned[CONTENT_LICENSE_ID_KEY],
description=row_cleaned.get(CONTENT_LICENSE_DESCRIPTION_KEY, None),
- copyright_holder=row_cleaned.get(CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None)
+ copyright_holder=row_cleaned.get(
+ CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY, None
+ ),
)
else:
license_dict = None
@@ -401,17 +426,19 @@ def _map_exercise_row_to_dict(self, row):
elif randomize_raw.lower() in CSV_STR_FALSE_VALUES:
randomize = False
else:
- raise ValueError('Unrecognized value ' + randomize_raw + ' for randomzied key')
+ raise ValueError(
+ "Unrecognized value " + randomize_raw + " for randomzied key"
+ )
exercise_data = dict(
mastery_model=exercises.M_OF_N,
randomize=randomize,
)
m_value = row_cleaned.get(EXERCISE_M_KEY, None)
if m_value:
- exercise_data['m'] = int(m_value)
+ exercise_data["m"] = int(m_value)
n_value = row_cleaned.get(EXERCISE_N_KEY, None)
if n_value:
- exercise_data['n'] = int(n_value)
+ exercise_data["n"] = int(n_value)
exercise_dict = dict(
chan_path=row_cleaned[CONTENT_PATH_KEY],
@@ -422,7 +449,7 @@ def _map_exercise_row_to_dict(self, row):
language=row_cleaned.get(CONTENT_LANGUAGE_KEY, None),
license=license_dict,
exercise_data=exercise_data,
- thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None)
+ thumbnail_chan_path=row_cleaned.get(CONTENT_THUMBNAIL_KEY, None),
)
return exercise_dict
@@ -518,14 +545,12 @@ def _map_exercise_question_row_to_dict(self, row):
hints=hints,
)
elif question_type == exercises.PERSEUS_QUESTION:
- raise ValueError('Perseus questions not currently supported in CSV workflow.')
+ raise ValueError(
+ "Perseus questions not currently supported in CSV workflow."
+ )
return question_dict
-
-
-
-
# CSV VALIDATION METHODS
############################################################################
@@ -537,8 +562,12 @@ def validate_headers(self):
self.validate_header(self.channeldir, self.channelinfo, CHANNEL_INFO_HEADER)
self.validate_header(self.channeldir, self.contentinfo, CONTENT_INFO_HEADER)
if self.has_exercises():
- self.validate_header(self.channeldir, self.exercisesinfo, EXERCISE_INFO_HEADER)
- self.validate_header(self.channeldir, self.questionsinfo, EXERCISE_QUESTIONS_INFO_HEADER)
+ self.validate_header(
+ self.channeldir, self.exercisesinfo, EXERCISE_INFO_HEADER
+ )
+ self.validate_header(
+ self.channeldir, self.questionsinfo, EXERCISE_QUESTIONS_INFO_HEADER
+ )
def validate_header(self, channeldir, filename, expected_header):
"""
@@ -550,8 +579,12 @@ def validate_header(self, channeldir, filename, expected_header):
dict_reader = csv.DictReader(csv_lines)
actual = set(dict_reader.fieldnames)
if not actual == expected:
- raise ValueError('Unexpected CSV file header in ' + csv_filename \
- + ' Expected header:' + str(expected))
+ raise ValueError(
+ "Unexpected CSV file header in "
+ + csv_filename
+ + " Expected header:"
+ + str(expected)
+ )
def validate(self):
"""
@@ -559,7 +592,6 @@ def validate(self):
"""
pass # TODO
-
# Generate CSV metadata from a given studio_id
############################################################################
@@ -568,117 +600,124 @@ def generate_exercises_from_importstudioid(self, args, options):
Create rows in Exercises.csv and ExerciseQuestions.csv from a Studio channel,
        specified based on a studio_id (e.g. studio_id of main_tree for some channel)
"""
- print('Generating Exercises.csv and ExerciseQuestions.csv from a Studio channel')
- self.studioapi = StudioApi(token=args['token'])
- channel_dict = self.studioapi.get_tree_for_studio_id(args['importstudioid'])
- json.dump(channel_dict, open('chefdata/studiotree.json', 'w'), indent=4, ensure_ascii=False, sort_keys=True)
+ print(
+ "Generating Exercises.csv and ExerciseQuestions.csv from a Studio channel"
+ )
+ self.studioapi = StudioApi(token=args["token"])
+ channel_dict = self.studioapi.get_tree_for_studio_id(args["importstudioid"])
+ json.dump(
+ channel_dict,
+ open("chefdata/studiotree.json", "w"),
+ indent=4,
+ ensure_ascii=False,
+ sort_keys=True,
+ )
soure_ids_seen = []
+
def _generate_source_id(subtree):
"""
            Creates a Source ID from the title and ensures it is unique within the channel.
"""
- candidate = subtree['title'].replace(' ', '_')
+ candidate = subtree["title"].replace(" ", "_")
if candidate not in soure_ids_seen:
source_id = candidate
soure_ids_seen.append(source_id)
else:
- source_id = candidate + subtree['node_id'][0:7]
+ source_id = candidate + subtree["node_id"][0:7]
soure_ids_seen.append(source_id)
return source_id
def _write_subtree(path_tuple, subtree, is_root=False):
- print(' '*len(path_tuple) + ' - ', subtree['title'])
- kind = subtree['kind']
+ print(" " * len(path_tuple) + " - ", subtree["title"])
+ kind = subtree["kind"]
# TOPIC ############################################################
- if kind == 'topic':
+ if kind == "topic":
if is_root:
- self.write_topic_row_from_studio_dict(path_tuple, subtree, is_root=is_root)
- for child in subtree['children']:
+ self.write_topic_row_from_studio_dict(
+ path_tuple, subtree, is_root=is_root
+ )
+ for child in subtree["children"]:
_write_subtree(path_tuple, child)
else:
self.write_topic_row_from_studio_dict(path_tuple, subtree)
- for child in subtree['children']:
- _write_subtree(path_tuple+[subtree['title']], child)
+ for child in subtree["children"]:
+ _write_subtree(path_tuple + [subtree["title"]], child)
# EXERCISE #########################################################
- elif kind == 'exercise':
+ elif kind == "exercise":
source_id = _generate_source_id(subtree)
self.write_exercice_row_from_studio_dict(path_tuple, subtree, source_id)
- for question_dict in subtree['assessment_items']:
+ for question_dict in subtree["assessment_items"]:
self.write_question_row_from_question_dict(source_id, question_dict)
else:
- print('skipping node', subtree['title'])
+ print("skipping node", subtree["title"])
- path_tuple = [ self.channeldir.split('/')[-1] ]
+ path_tuple = [self.channeldir.split("/")[-1]]
_write_subtree(path_tuple, channel_dict, is_root=True)
def write_commont_studio_dict_from_row(self, studio_dict, row):
- if studio_dict['license']:
- license_dict = self.studioapi.licenses_by_id[studio_dict['license']]
+ if studio_dict["license"]:
+ license_dict = self.studioapi.licenses_by_id[studio_dict["license"]]
else:
- license_dict = {'license_name': None}
- row[CONTENT_TITLE_KEY] = studio_dict['title']
- row[CONTENT_DESCRIPTION_KEY] = studio_dict['description']
- row[CONTENT_AUTHOR_KEY] = studio_dict['author']
- row[CONTENT_LANGUAGE_KEY] = 'en'
- row[CONTENT_LICENSE_ID_KEY] = license_dict['license_name']
+ license_dict = {"license_name": None}
+ row[CONTENT_TITLE_KEY] = studio_dict["title"]
+ row[CONTENT_DESCRIPTION_KEY] = studio_dict["description"]
+ row[CONTENT_AUTHOR_KEY] = studio_dict["author"]
+ row[CONTENT_LANGUAGE_KEY] = "en"
+ row[CONTENT_LICENSE_ID_KEY] = license_dict["license_name"]
row[CONTENT_LICENSE_DESCRIPTION_KEY] = None
- row[CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY] = studio_dict['copyright_holder']
+ row[CONTENT_LICENSE_COPYRIGHT_HOLDER_KEY] = studio_dict["copyright_holder"]
row[CONTENT_THUMBNAIL_KEY] = None
-
def write_topic_row_from_studio_dict(self, path_tuple, studio_dict, is_root=False):
if is_root:
return
# print('Generating Content.csv rows folders and file in channeldir for path_tuple ', path_tuple, studio_dict['title'])
file_path = get_metadata_file_path(self.channeldir, self.contentinfo)
- with open(file_path, 'a') as csv_file:
+ with open(file_path, "a") as csv_file:
csvwriter = csv.DictWriter(csv_file, CONTENT_INFO_HEADER)
- title = studio_dict['title']
- path_with_self = '/'.join(path_tuple+[title])
+ title = studio_dict["title"]
+ path_with_self = "/".join(path_tuple + [title])
if not os.path.exists(path_with_self):
os.makedirs(path_with_self, exist_ok=True)
topic_row = {}
self.write_commont_studio_dict_from_row(studio_dict, topic_row)
# WRITE TOPIC ROW
topic_row[CONTENT_PATH_KEY] = path_with_self
- topic_row[CONTENT_SOURCEID_KEY] = studio_dict['node_id'][0:7]
+ topic_row[CONTENT_SOURCEID_KEY] = studio_dict["node_id"][0:7]
csvwriter.writerow(topic_row)
-
def write_exercice_row_from_studio_dict(self, path_tuple, studio_dict, source_id):
file_path = get_metadata_file_path(self.channeldir, self.exercisesinfo)
- with open(file_path, 'a') as csv_file:
+ with open(file_path, "a") as csv_file:
csvwriter = csv.DictWriter(csv_file, EXERCISE_INFO_HEADER)
exercise_row = {}
self.write_commont_studio_dict_from_row(studio_dict, exercise_row)
- exercise_title = studio_dict['title']
- exercise_row[CONTENT_PATH_KEY] = '/'.join(path_tuple+[exercise_title])
+ exercise_title = studio_dict["title"]
+ exercise_row[CONTENT_PATH_KEY] = "/".join(path_tuple + [exercise_title])
exercise_row[EXERCISE_SOURCEID_KEY] = source_id
# Exercises specifics
- if isinstance(studio_dict['extra_fields'], str):
- extra_fields = json.loads(studio_dict['extra_fields'])
+ if isinstance(studio_dict["extra_fields"], str):
+ extra_fields = json.loads(studio_dict["extra_fields"])
else:
- extra_fields = studio_dict['extra_fields']
- exercise_row[EXERCISE_M_KEY] = int(extra_fields['m'])
- exercise_row[EXERCISE_N_KEY] = int(extra_fields['n'])
- exercise_row[EXERCISE_RANDOMIZE_KEY] = extra_fields['randomize']
+ extra_fields = studio_dict["extra_fields"]
+ exercise_row[EXERCISE_M_KEY] = int(extra_fields["m"])
+ exercise_row[EXERCISE_N_KEY] = int(extra_fields["n"])
+ exercise_row[EXERCISE_RANDOMIZE_KEY] = extra_fields["randomize"]
# WRITE EXERCISE ROW
csvwriter.writerow(exercise_row)
-
-
def _make_local_question_images(self, question_dict):
"""
        Process all markdown image links in question_dict:
- download them to local files under exerciseimages/
"""
question_dict = question_dict.copy()
- dest_path = 'exerciseimages/'
+ dest_path = "exerciseimages/"
if not os.path.exists(dest_path):
os.mkdir(dest_path)
@@ -686,53 +725,58 @@ def _make_local_question_images(self, question_dict):
# helper method
def _process_string(string):
image_regex = re.compile(MARKDOWN_IMAGE_REGEX, flags=re.IGNORECASE)
- contentstorage_prefix = '${☣ CONTENTSTORAGE}/'
- studio_storage = 'https://studio.learningequality.org/content/storage/'
+ contentstorage_prefix = "${☣ CONTENTSTORAGE}/"
+ studio_storage = "https://studio.learningequality.org/content/storage/"
matches = image_regex.findall(string)
# Parse all matches
for match in matches:
file_result = match[1]
- file_name = file_result.replace(contentstorage_prefix, '')
- file_url = studio_storage + file_name[0] + '/' + file_name[1] + '/' + file_name
+ file_name = file_result.replace(contentstorage_prefix, "")
+ file_url = (
+ studio_storage + file_name[0] + "/" + file_name[1] + "/" + file_name
+ )
file_local_path = os.path.join(dest_path, file_name)
response = requests.get(file_url)
if response.status_code != 200:
- print('Failed for image ' + str(response.status_code) + ' >> ' + file_url)
+ print(
+ "Failed for image "
+ + str(response.status_code)
+ + " >> "
+ + file_url
+ )
return string
- with open(file_local_path, 'wb') as local_file:
+ with open(file_local_path, "wb") as local_file:
local_file.write(response.content)
- print('saved image file', file_local_path)
+ print("saved image file", file_local_path)
string = string.replace(file_result, file_local_path)
return string
# Process images in question
- new_question = _process_string(question_dict['question'])
- question_dict['question'] = new_question
+ new_question = _process_string(question_dict["question"])
+ question_dict["question"] = new_question
# Process images in answers
- answers = json.loads(question_dict['answers'])
+ answers = json.loads(question_dict["answers"])
new_answers = []
for ans in answers:
new_ans = ans.copy()
- new_ans['answer'] = _process_string(new_ans['answer'])
+ new_ans["answer"] = _process_string(new_ans["answer"])
new_answers.append(new_ans)
- question_dict['answers'] = json.dumps(new_answers)
+ question_dict["answers"] = json.dumps(new_answers)
# TODO: process hint images
return question_dict
-
-
def write_question_row_from_question_dict(self, source_id, question_dict):
file_path = get_metadata_file_path(self.channeldir, self.questionsinfo)
- if question_dict['type'] == 'perseus_question':
- print('Skipping perseus_question -- not supported in CSV workflow.')
+ if question_dict["type"] == "perseus_question":
+ print("Skipping perseus_question -- not supported in CSV workflow.")
return
- with open(file_path, 'a') as csv_file:
+ with open(file_path, "a") as csv_file:
csvwriter = csv.DictWriter(csv_file, EXERCISE_QUESTIONS_INFO_HEADER)
def _safe_list_get(l, idx, default):
@@ -745,48 +789,71 @@ def _safe_list_get(l, idx, default):
question_dict = self._make_local_question_images(question_dict)
type_lookup = {
- 'single_selection': exercises.SINGLE_SELECTION,
- 'true_false': exercises.SINGLE_SELECTION,
- 'multiple_selection': exercises.MULTIPLE_SELECTION,
- 'input_question': exercises.INPUT_QUESTION,
+ "single_selection": exercises.SINGLE_SELECTION,
+ "true_false": exercises.SINGLE_SELECTION,
+ "multiple_selection": exercises.MULTIPLE_SELECTION,
+ "input_question": exercises.INPUT_QUESTION,
}
# ANSWERS
- answers = json.loads(question_dict['answers'])
+ answers = json.loads(question_dict["answers"])
options = [] # all options
            correct = [] # correct answers
for ans in answers:
- options.append(ans['answer'])
- if ans['correct']:
- correct.append(ans['answer'])
+ options.append(ans["answer"])
+ if ans["correct"]:
+ correct.append(ans["answer"])
extra_options = DEFAULT_EXTRA_ITEMS_SEPARATOR.join(options[5:])
# HINTS
- hints_raw = json.loads(question_dict['hints'])
+ hints_raw = json.loads(question_dict["hints"])
if hints_raw:
- raise ValueError('Found hints but not handled..')
+ raise ValueError("Found hints but not handled..")
- LOGGER.info(' - writing question with studio_id=' + question_dict['assessment_id'])
+ LOGGER.info(
+ " - writing question with studio_id="
+ + question_dict["assessment_id"]
+ )
question_row = {}
question_row[EXERCISE_SOURCEID_KEY] = source_id
- question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict['assessment_id'] # question_dict['assessment_id']
- question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[question_dict['type']]
- question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict['question']
- question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(options, 0, None)
- question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(options, 1, None)
- question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(options, 2, None)
- question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(options, 3, None)
- question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(options, 4, None)
+ question_row[EXERCISE_QUESTIONS_QUESTIONID_KEY] = question_dict[
+ "assessment_id"
+ ] # question_dict['assessment_id']
+ question_row[EXERCISE_QUESTIONS_TYPE_KEY] = type_lookup[
+ question_dict["type"]
+ ]
+ question_row[EXERCISE_QUESTIONS_QUESTION_KEY] = question_dict["question"]
+ question_row[EXERCISE_QUESTIONS_OPTION_A_KEY] = _safe_list_get(
+ options, 0, None
+ )
+ question_row[EXERCISE_QUESTIONS_OPTION_B_KEY] = _safe_list_get(
+ options, 1, None
+ )
+ question_row[EXERCISE_QUESTIONS_OPTION_C_KEY] = _safe_list_get(
+ options, 2, None
+ )
+ question_row[EXERCISE_QUESTIONS_OPTION_D_KEY] = _safe_list_get(
+ options, 3, None
+ )
+ question_row[EXERCISE_QUESTIONS_OPTION_E_KEY] = _safe_list_get(
+ options, 4, None
+ )
question_row[EXERCISE_QUESTIONS_OPTION_FGHI_KEY] = extra_options
- question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(correct, 0, None)
- question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(correct, 1, None)
- question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(correct, 2, None)
- question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO
- question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO
- question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO
- question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO
- question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO
- question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER_KEY] = _safe_list_get(
+ correct, 0, None
+ )
+ question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER2_KEY] = _safe_list_get(
+ correct, 1, None
+ )
+ question_row[EXERCISE_QUESTIONS_CORRECT_ANSWER3_KEY] = _safe_list_get(
+ correct, 2, None
+ )
+ question_row[EXERCISE_QUESTIONS_HINT_1_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_HINT_2_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_HINT_3_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_HINT_4_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_HINT_5_KEY] = None # TODO
+ question_row[EXERCISE_QUESTIONS_HINT_6789_KEY] = None # TODO
# WRITE QUESTION ROW
csvwriter.writerow(question_row)
# 'files': [],
@@ -796,9 +863,6 @@ def _safe_list_get(l, idx, default):
# 'randomize': True,
# 'deleted': False},
-
-
-
# Generate CSV from folder structure in channeldir
############################################################################
@@ -806,34 +870,44 @@ def generate_contentinfo_from_channeldir(self, args, options):
"""
Create rows in Content.csv for each folder and file in `self.channeldir`.
"""
- LOGGER.info('Generating Content.csv rows folders and file in channeldir')
+ LOGGER.info("Generating Content.csv rows folders and file in channeldir")
file_path = get_metadata_file_path(self.channeldir, self.contentinfo)
- with open(file_path, 'a') as csv_file:
+ with open(file_path, "a") as csv_file:
csvwriter = csv.DictWriter(csv_file, CONTENT_INFO_HEADER)
- channeldir = args['channeldir']
+ channeldir = args["channeldir"]
if channeldir.endswith(os.path.sep):
channeldir.rstrip(os.path.sep)
# MAIN PROCESSING OF os.walk OUTPUT
content_folders = sorted(os.walk(channeldir))
- _ = content_folders.pop(0) # Skip over channel root folder
+ _ = content_folders.pop(0) # Skip over channel root folder
for rel_path, _subfolders, filenames in content_folders:
- LOGGER.info('processing folder ' + str(rel_path))
+ LOGGER.info("processing folder " + str(rel_path))
sorted_filenames = sorted(filenames)
- self.generate_contentinfo_from_folder(csvwriter, rel_path, sorted_filenames)
- LOGGER.info('Generted {} row for all folders and files in {}'.format(self.contentinfo, self.channeldir))
+ self.generate_contentinfo_from_folder(
+ csvwriter, rel_path, sorted_filenames
+ )
+ LOGGER.info(
+ "Generted {} row for all folders and files in {}".format(
+ self.contentinfo, self.channeldir
+ )
+ )
def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames):
"""
Create a topic node row in Content.csv for the folder at `rel_path` and
add content node rows for all the files in the `rel_path` folder.
"""
- LOGGER.debug('IN process_folder ' + str(rel_path) + ' ' + str(filenames))
- from ricecooker.utils.linecook import filter_filenames, filter_thumbnail_files, chan_path_from_rel_path
+ LOGGER.debug("IN process_folder " + str(rel_path) + " " + str(filenames))
+ from ricecooker.utils.linecook import (
+ filter_filenames,
+ filter_thumbnail_files,
+ chan_path_from_rel_path,
+ )
# WRITE TOPIC ROW
- topicrow = self.channeldir_node_to_row( rel_path.split(os.path.sep) )
+ topicrow = self.channeldir_node_to_row(rel_path.split(os.path.sep))
csvwriter.writerow(topicrow)
# WRITE CONTENT NODE ROWS
@@ -846,7 +920,6 @@ def generate_contentinfo_from_folder(self, csvwriter, rel_path, filenames):
filerow = self.channeldir_node_to_row(path_tuple)
csvwriter.writerow(filerow)
-
def channeldir_node_to_row(self, path_tuple):
"""
Return a dict with keys corresponding to Content.csv columns.
@@ -854,18 +927,17 @@ def channeldir_node_to_row(self, path_tuple):
row = dict()
for key in CONTENT_INFO_HEADER:
row[key] = None
- row[CONTENT_PATH_KEY] = "/".join(path_tuple) # use / in .csv on Windows and UNIX
- title = path_tuple[-1].replace('_', ' ')
+ row[CONTENT_PATH_KEY] = "/".join(
+ path_tuple
+ ) # use / in .csv on Windows and UNIX
+ title = path_tuple[-1].replace("_", " ")
for ext in content_kinds.MAPPING.keys():
if title.endswith(ext):
- title = title.replace('.'+ext, '')
+ title = title.replace("." + ext, "")
row[CONTENT_TITLE_KEY] = title
row[CONTENT_SOURCEID_KEY] = path_tuple[-1]
return row
-
-
-
# UTILS
############################################################################
@@ -874,19 +946,27 @@ def generate_templates(self, exercise_questions=False):
        Create empty .csv files with the right headers.
        Will place the files as siblings of directory `channeldir`.
"""
- self.generate_template(channeldir=self.channeldir,
- filename=self.channelinfo,
- header=CHANNEL_INFO_HEADER)
- self.generate_template(channeldir=self.channeldir,
- filename=self.contentinfo,
- header=CONTENT_INFO_HEADER)
+ self.generate_template(
+ channeldir=self.channeldir,
+ filename=self.channelinfo,
+ header=CHANNEL_INFO_HEADER,
+ )
+ self.generate_template(
+ channeldir=self.channeldir,
+ filename=self.contentinfo,
+ header=CONTENT_INFO_HEADER,
+ )
if exercise_questions:
- self.generate_template(channeldir=self.channeldir,
- filename=self.exercisesinfo,
- header=EXERCISE_INFO_HEADER)
- self.generate_template(channeldir=self.channeldir,
- filename=self.questionsinfo,
- header=EXERCISE_QUESTIONS_INFO_HEADER)
+ self.generate_template(
+ channeldir=self.channeldir,
+ filename=self.exercisesinfo,
+ header=EXERCISE_INFO_HEADER,
+ )
+ self.generate_template(
+ channeldir=self.channeldir,
+ filename=self.questionsinfo,
+ header=EXERCISE_QUESTIONS_INFO_HEADER,
+ )
def generate_template(self, channeldir, filename, header):
"""
@@ -895,7 +975,7 @@ def generate_template(self, channeldir, filename, header):
"""
file_path = get_metadata_file_path(channeldir, filename)
if not os.path.exists(file_path):
- with open(file_path, 'w') as csv_file:
+ with open(file_path, "w") as csv_file:
csvwriter = csv.DictWriter(csv_file, header)
csvwriter.writeheader()
@@ -905,7 +985,7 @@ def _read_csv_lines(path):
Opens CSV file `path` and returns list of rows.
Pass output of this function to `csv.DictReader` for reading data.
"""
- csv_file = open(path, 'r')
+ csv_file = open(path, "r")
csv_lines_raw = csv_file.readlines()
csv_lines_clean = [line for line in csv_lines_raw if len(line.strip()) > 0]
return csv_lines_clean
@@ -917,15 +997,13 @@ def _clean_dict(row):
"""
row_cleaned = {}
for key, val in row.items():
- if val is None or val == '':
+ if val is None or val == "":
row_cleaned[key] = None
else:
row_cleaned[key] = val
return row_cleaned
-
-
class ExcelMetadataProvider(MetadataProvider):
# LIBRARIES COULD USE
# https://github.com/jmcnamara/XlsxWriter/blob/95334f999d3a5fb58d8da3197260e920be357638/dev/docs/source/alternatives.rst
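
For orientation between the file diffs above and below: a minimal sketch of how the reformatted CsvMetadataProvider is typically wired up. The module path, channel directory, and path tuple here are illustrative assumptions, not taken from this diff; the sketch assumes Channel.csv and Content.csv already exist next to the channel directory.

    from ricecooker.utils.metadata_provider import CsvMetadataProvider

    # Assumes ./content/channeldir exists with Channel.csv and Content.csv as siblings.
    provider = CsvMetadataProvider("./content/channeldir")
    channel_info = provider.get_channel_info()           # first data row of Channel.csv
    node_meta = provider.get(("channeldir", "Topic A"))  # lookup by path tuple
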
diff --git a/ricecooker/utils/paths.py b/ricecooker/utils/paths.py
index c522b109..002abf0f 100644
--- a/ricecooker/utils/paths.py
+++ b/ricecooker/utils/paths.py
@@ -15,7 +15,7 @@ def file_exists(filepath):
def get_name_from_url(url):
"""
- get the filename from a url
+ get the filename from a url
url = http://abc.com/xyz.txt
get_name_from_url(url) -> xyz.txt
"""
@@ -37,7 +37,7 @@ def get_name_from_url(url):
def get_name_from_url_no_ext(url):
"""
- get the filename without the extension name from a url
+ get the filename without the extension name from a url
url = http://abc.com/xyz.txt
get_name_from_url(url) -> xyz
"""
diff --git a/ricecooker/utils/pdf.py b/ricecooker/utils/pdf.py
index 0fc58daf..9b3562e1 100644
--- a/ricecooker/utils/pdf.py
+++ b/ricecooker/utils/pdf.py
@@ -1,8 +1,11 @@
import os
-from PyPDF2 import PdfFileWriter, PdfFileReader
-from PyPDF2.generic import Destination, NullObject
+from PyPDF2 import PdfFileReader
+from PyPDF2 import PdfFileWriter
+from PyPDF2.generic import Destination
+from PyPDF2.generic import NullObject
from PyPDF2.utils import PdfReadError
+
from ricecooker.utils.downloader import read
@@ -13,6 +16,7 @@ def __init__(self, title, page, typ, *args):
except PdfReadError:
pass
+
class CustomPDFReader(PdfFileReader):
def _buildDestination(self, title, array):
page, typ = array[0:2]
@@ -24,7 +28,8 @@ class PDFParser(object):
"""
Helper class for extracting table of contents and splitting PDFs into chapters.
"""
- path = None # Local path to source PDF document that will be processed
+
+ path = None # Local path to source PDF document that will be processed
def __init__(self, source_path, directory="downloads"):
self.directory = directory
@@ -58,20 +63,19 @@ def open(self, update=False):
with open(self.path, "wb") as fobj:
fobj.write(read(self.source_path))
- self.file = open(self.path, 'rb')
+ self.file = open(self.path, "rb")
self.pdf = CustomPDFReader(self.file)
def close(self):
"""
Close main pdf file when done.
"""
- self.file.close() # Make sure zipfile closes no matter what
+ self.file.close() # Make sure the PDF file handle closes no matter what
def check_path(self):
if not self.path:
raise ValueError("self.path not found; call `open` first")
-
def get_toc(self, subchapters=False):
"""
Returns table-of-contents information extracted from the PDF doc.
@@ -96,10 +100,12 @@ def get_toc(self, subchapters=False):
for dest in self.pdf.getOutlines():
# Process chapters
- if isinstance(dest, CustomDestination) and not isinstance(dest['/Page'], NullObject):
+ if isinstance(dest, CustomDestination) and not isinstance(
+ dest["/Page"], NullObject
+ ):
page_num = self.pdf.getDestinationPageNumber(dest)
chapter_pagerange = {
- "title": dest['/Title'].replace('\xa0', ' '),
+ "title": dest["/Title"].replace("\xa0", " "),
"page_start": page_num if index != 0 else 0,
"page_end": self.pdf.numPages,
}
@@ -122,37 +128,43 @@ def get_toc(self, subchapters=False):
parent = chapters[index - 1]
subindex = 0
for subdest in dest:
- if isinstance(subdest, CustomDestination) and not isinstance(subdest['/Page'], NullObject):
+ if isinstance(subdest, CustomDestination) and not isinstance(
+ subdest["/Page"], NullObject
+ ):
subpage_num = self.pdf.getDestinationPageNumber(subdest)
- parent['children'].append({
- "title": subdest['/Title'].replace('\xa0', ' '),
- "page_start": subpage_num,
- "page_end": self.pdf.numPages
- })
+ parent["children"].append(
+ {
+ "title": subdest["/Title"].replace("\xa0", " "),
+ "page_start": subpage_num,
+ "page_end": self.pdf.numPages,
+ }
+ )
if subindex > 0:
- parent['children'][subindex - 1]["page_end"] = subpage_num
- subindex +=1
+ parent["children"][subindex - 1]["page_end"] = subpage_num
+ subindex += 1
return chapters
-
- def write_pagerange(self, pagerange, prefix=''):
+ def write_pagerange(self, pagerange, prefix=""):
"""
Save the subset of pages specified in `pagerange` (dict) as separate PDF.
e.g. pagerange = {'title':'First chapter', 'page_start':0, 'page_end':5}
"""
writer = PdfFileWriter()
- slug = "".join([c for c in pagerange['title'].replace(" ", "-") if c.isalnum() or c == "-"])
- write_to_path = os.path.sep.join([self.directory, "{}{}.pdf".format(prefix, slug)])
- for page in range(pagerange['page_start'], pagerange['page_end']):
+ slug = "".join(
+ [c for c in pagerange["title"].replace(" ", "-") if c.isalnum() or c == "-"]
+ )
+ write_to_path = os.path.sep.join(
+ [self.directory, "{}{}.pdf".format(prefix, slug)]
+ )
+ for page in range(pagerange["page_start"], pagerange["page_end"]):
writer.addPage(self.pdf.getPage(page))
- writer.removeLinks() # must be done every page
- with open(write_to_path, 'wb') as outfile:
+ writer.removeLinks() # must be done every page
+ with open(write_to_path, "wb") as outfile:
writer.write(outfile)
return write_to_path
-
- def split_chapters(self, jsondata=None, prefix=''):
+ def split_chapters(self, jsondata=None, prefix=""):
"""
Split the PDF doc into individual chapters based on the page-range info,
storing individual split PDFs in the output folder `self.directory`.
@@ -164,12 +176,11 @@ def split_chapters(self, jsondata=None, prefix=''):
toc = jsondata or self.get_toc()
chapters = []
for index, chpagerange in enumerate(toc):
- newprefix = prefix + str(index) + '-'
+ newprefix = prefix + str(index) + "-"
write_to_path = self.write_pagerange(chpagerange, prefix=newprefix)
- chapters.append({"title": chpagerange['title'], "path": write_to_path})
+ chapters.append({"title": chpagerange["title"], "path": write_to_path})
return chapters
-
def split_subchapters(self, jsondata=None):
"""
Transform a PDF doc into tree of chapters (topics) and subchapters (docs)
@@ -184,31 +195,37 @@ def split_subchapters(self, jsondata=None):
for index, chpagerange in enumerate(toc):
            # chapter prefix of the form 1-, 2-, 3-, ... to avoid name conflicts
- chprefix = str(index) + '-'
+ chprefix = str(index) + "-"
# Case A: chapter with no subchapters
- if 'children' not in chpagerange or not chpagerange['children']:
+ if "children" not in chpagerange or not chpagerange["children"]:
write_to_path = self.write_pagerange(chpagerange, prefix=chprefix)
- chapters.append({"title": chpagerange['title'], "path": write_to_path})
+ chapters.append({"title": chpagerange["title"], "path": write_to_path})
# Case B: chapter with subchapters
- elif 'children' in chpagerange:
- chapter_topic = { 'title': chpagerange['title'], 'children': [] }
- subchpageranges = chpagerange['children']
+ elif "children" in chpagerange:
+ chapter_topic = {"title": chpagerange["title"], "children": []}
+ subchpageranges = chpagerange["children"]
first_subchapter = subchpageranges[0]
# Handle case when chapter has "intro pages" before first subchapter
- if first_subchapter['page_start'] > chpagerange['page_start']:
+ if first_subchapter["page_start"] > chpagerange["page_start"]:
chintro_pagerange = {
- 'title': chpagerange['title'],
- 'page_start': chpagerange['page_start'],
- 'page_end': first_subchapter['page_start']
+ "title": chpagerange["title"],
+ "page_start": chpagerange["page_start"],
+ "page_end": first_subchapter["page_start"],
}
- write_to_path = self.write_pagerange(chintro_pagerange, prefix=chprefix)
- chapter_topic['children'].append({"title": chpagerange['title'], "path": write_to_path})
+ write_to_path = self.write_pagerange(
+ chintro_pagerange, prefix=chprefix
+ )
+ chapter_topic["children"].append(
+ {"title": chpagerange["title"], "path": write_to_path}
+ )
# Handle all subchapters
- subchapter_nodes = self.split_chapters(jsondata=subchpageranges, prefix=chprefix)
- chapter_topic['children'].extend(subchapter_nodes)
+ subchapter_nodes = self.split_chapters(
+ jsondata=subchpageranges, prefix=chprefix
+ )
+ chapter_topic["children"].extend(subchapter_nodes)
chapters.append(chapter_topic)
return chapters
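
A minimal sketch of the PDFParser flow whose formatting changes appear above, assuming a reachable source PDF; the URL and output directory are placeholders.

    from ricecooker.utils.pdf import PDFParser

    parser = PDFParser("https://example.org/book.pdf", directory="downloads")
    parser.open()                            # download (if needed) and open the PDF
    toc = parser.get_toc(subchapters=True)   # chapters with nested subchapter page ranges
    chapters = parser.split_subchapters(jsondata=toc)  # writes one PDF per (sub)chapter
    parser.close()
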
diff --git a/ricecooker/utils/proxy.py b/ricecooker/utils/proxy.py
index 5741bad1..1345eb84 100644
--- a/ricecooker/utils/proxy.py
+++ b/ricecooker/utils/proxy.py
@@ -7,40 +7,41 @@
import os
import random
import re
-import requests
import time
+import requests
-PROXY_LIST = [] # Current list of proxy servers to choose from
-
-RECENT_PROXIES = [] # Recently used proxies (to avoid using too often)
-RECENT_MAX = 3 # Rotatate between at least 3 proxy servers
+PROXY_LIST = [] # Current list of proxy servers to choose from
-MAYBE_BROKEN_PROXIES = {} # {proxy: error_list} to keep track of proxy errors
-ERROR_FORGET_TIME = 10 # Ignore proxy errors that are older than 10 mins
-ERROR_THRESHOLD = 3 # Add to broken list if encounter 3 errs in 10 mins
+RECENT_PROXIES = [] # Recently used proxies (to avoid using too often)
+RECENT_MAX = 3 # Rotate between at least 3 proxy servers
-BROKEN_PROXIES = [] # Known-bad proxies (we want to void choosing these)
-BROKEN_PROXIES_CACHE_FILENAME = 'broken_proxies.list'
-BROKEN_CACHE_EXPIRE_MINS = 2*24*60 # Ignore broken proxy cache older than 2 days
+MAYBE_BROKEN_PROXIES = {} # {proxy: error_list} to keep track of proxy errors
+ERROR_FORGET_TIME = 10 # Ignore proxy errors that are older than 10 mins
+ERROR_THRESHOLD = 3 # Add to broken list if encounter 3 errs in 10 mins
+BROKEN_PROXIES = [] # Known-bad proxies (we want to avoid choosing these)
+BROKEN_PROXIES_CACHE_FILENAME = "broken_proxies.list"
+BROKEN_CACHE_EXPIRE_MINS = 2 * 24 * 60 # Ignore broken proxy cache older than 2 days
# LOADERS
################################################################################
+
def load_env_proxies():
"""
    Load data from the ENV variable PROXY_LIST (a ;-separated list of proxies).
"""
- proxy_list_env_var = os.getenv('PROXY_LIST', None)
- proxy_list_env_var = proxy_list_env_var.strip(';').strip()
+ proxy_list_env_var = os.getenv("PROXY_LIST", None)
+ proxy_list_env_var = proxy_list_env_var.strip(";").strip()
if proxy_list_env_var:
- return [proxy.strip() for proxy in proxy_list_env_var.split(';')]
+ return [proxy.strip() for proxy in proxy_list_env_var.split(";")]
else:
return []
+
def load_broken_proxies_cache():
"""
    Load data from 'broken_proxies.list' if the file is not too old.
@@ -48,15 +49,15 @@ def load_broken_proxies_cache():
if not os.path.exists(BROKEN_PROXIES_CACHE_FILENAME):
return []
mtime = os.path.getmtime(BROKEN_PROXIES_CACHE_FILENAME)
- if (time.time() - mtime) > 60*BROKEN_CACHE_EXPIRE_MINS:
+ if (time.time() - mtime) > 60 * BROKEN_CACHE_EXPIRE_MINS:
os.remove(BROKEN_PROXIES_CACHE_FILENAME)
return []
broken_proxies = []
- with open(BROKEN_PROXIES_CACHE_FILENAME, 'r') as bpl_file:
+ with open(BROKEN_PROXIES_CACHE_FILENAME, "r") as bpl_file:
for line in bpl_file.readlines():
line = line.strip()
- if line and not line.startswith('#'):
- broken_proxy = line.split('#')[0].strip()
+ if line and not line.startswith("#"):
+ broken_proxy = line.split("#")[0].strip()
broken_proxies.append(broken_proxy)
return broken_proxies
@@ -66,18 +67,18 @@ def get_proxyscape_proxies():
Loads a list of `{ip_address}:{port}` for public proxy servers.
"""
PROXY_TIMOUT_LIMIT = "1000"
- url = 'https://api.proxyscrape.com/?request=getproxies'
- url += '&proxytype=http&country=all&ssl=yes&anonymity=all'
- url += '&timeout=' + PROXY_TIMOUT_LIMIT
+ url = "https://api.proxyscrape.com/?request=getproxies"
+ url += "&proxytype=http&country=all&ssl=yes&anonymity=all"
+ url += "&timeout=" + PROXY_TIMOUT_LIMIT
r = requests.get(url)
- return r.text.split('\r\n')
+ return r.text.split("\r\n")
def get_sslproxies_proxies():
- r = requests.get('https://sslproxies.org')
+ r = requests.get("https://sslproxies.org")
     matches = re.findall(r"<td>\d+\.\d+\.\d+\.\d+</td><td>\d+</td>", r.text)
-    revised = [m.replace('<td>', '') for m in matches]
-    proxies = [s.replace('</td>', ':')[:-1] for s in revised]
+    revised = [m.replace("<td>", "") for m in matches]
+    proxies = [s.replace("</td>", ":")[:-1] for s in revised]
return proxies
@@ -90,8 +91,8 @@ def get_proxies(refresh=False):
if len(PROXY_LIST) == 0 or refresh:
# This is either the first run or force-refresh of the list is requested
- if os.getenv('PROXY_LIST', None):
- proxy_list = load_env_proxies() # (re)load ;-spearated list from ENV
+ if os.getenv("PROXY_LIST", None):
+ proxy_list = load_env_proxies() # (re)load ;-separated list from ENV
else:
proxy_list = get_proxyscape_proxies()
broken_proxy_list = load_broken_proxies_cache()
@@ -102,10 +103,10 @@ def get_proxies(refresh=False):
return PROXY_LIST
-
# MAIN
################################################################################
+
def choose_proxy():
"""
Main function called externally to get a random proxy from the PROXY_LIST.
@@ -142,10 +143,10 @@ def choose_proxy():
return proxy
-
# ERROR LOGIC
################################################################################
+
def record_error_for_proxy(proxy, exception=None):
"""
Record a problem with the proxy server `proxy`, optionally passing in the
@@ -162,27 +163,27 @@ def record_error_for_proxy(proxy, exception=None):
proxy_errors = MAYBE_BROKEN_PROXIES[proxy]
recent_proxy_errors = []
for proxy_error in proxy_errors:
- if (time.time() - proxy_error['timestamp']) < ERROR_FORGET_TIME*60:
+ if (time.time() - proxy_error["timestamp"]) < ERROR_FORGET_TIME * 60:
recent_proxy_errors.append(proxy_error)
recent_proxy_errors.append(error_dict)
MAYBE_BROKEN_PROXIES[proxy] = recent_proxy_errors
if len(recent_proxy_errors) >= ERROR_THRESHOLD:
- reason = str(exception).split('\n')[0] if exception else None
+ reason = str(exception).split("\n")[0] if exception else None
add_to_broken_proxy_list(proxy, reason=reason)
else:
MAYBE_BROKEN_PROXIES[proxy] = [error_dict]
-def add_to_broken_proxy_list(proxy, reason=''):
+def add_to_broken_proxy_list(proxy, reason=""):
global BROKEN_PROXIES
if not proxy in BROKEN_PROXIES:
BROKEN_PROXIES.append(proxy)
- with open(BROKEN_PROXIES_CACHE_FILENAME, 'a') as bpl_file:
+ with open(BROKEN_PROXIES_CACHE_FILENAME, "a") as bpl_file:
line = proxy
if reason:
- line += ' # ' + str(reason)
- bpl_file.write(line + '\n')
+ line += " # " + str(reason)
+ bpl_file.write(line + "\n")
if proxy in PROXY_LIST:
PROXY_LIST.remove(proxy)
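
For context, a hedged sketch of how the proxy helpers above are meant to be combined when fetching through a rotating proxy. The target URL and the requests `proxies` wiring are assumptions, since this module only selects proxies and records their failures.

    import requests

    from ricecooker.utils import proxy

    url = "https://example.org/page.html"  # placeholder target
    chosen = proxy.choose_proxy()          # e.g. "203.0.113.7:3128"
    try:
        requests.get(url, proxies={"https": "http://" + chosen}, timeout=10)
    except requests.RequestException as exc:
        # After ERROR_THRESHOLD recent errors the proxy lands in broken_proxies.list
        proxy.record_error_for_proxy(chosen, exception=exc)
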
diff --git a/ricecooker/utils/subtitles.py b/ricecooker/utils/subtitles.py
index 1c03339c..4b1442ee 100644
--- a/ricecooker/utils/subtitles.py
+++ b/ricecooker/utils/subtitles.py
@@ -1,9 +1,16 @@
import codecs
-from pycaption import CaptionSet, WebVTTWriter
-from pycaption import WebVTTReader, SRTReader, SAMIReader, SCCReader, DFXPReader
-from pycaption import CaptionReadError, CaptionReadNoCaptions
-from pycaption.base import DEFAULT_LANGUAGE_CODE
+
from le_utils.constants import file_formats
+from pycaption import CaptionReadError
+from pycaption import CaptionReadNoCaptions
+from pycaption import CaptionSet
+from pycaption import DFXPReader
+from pycaption import SAMIReader
+from pycaption import SCCReader
+from pycaption import SRTReader
+from pycaption import WebVTTReader
+from pycaption import WebVTTWriter
+from pycaption.base import DEFAULT_LANGUAGE_CODE
LANGUAGE_CODE_UNKNOWN = DEFAULT_LANGUAGE_CODE
@@ -13,6 +20,7 @@ class InvalidSubtitleFormatError(TypeError):
"""
Custom error indicating a format that is invalid
"""
+
pass
@@ -20,6 +28,7 @@ class InvalidSubtitleLanguageError(ValueError):
"""
Custom error indicating that the provided language isn't present in a captions file
"""
+
pass
@@ -28,6 +37,7 @@ class SubtitleReader:
A wrapper class for the pycaption readers since the interface differs between all. This will
call read with `LANGUAGE_CODE_UNKNOWN` if `requires_language` is `True`
"""
+
def __init__(self, reader, requires_language=False):
"""
:param reader: A pycaption reader
@@ -61,9 +71,9 @@ def read(self, caption_str):
return self.reader.read(caption_str)
except CaptionReadNoCaptions:
- raise InvalidSubtitleFormatError('Caption file has no captions')
+ raise InvalidSubtitleFormatError("Caption file has no captions")
except (CaptionReadError, UnicodeDecodeError) as e:
- raise InvalidSubtitleFormatError('Caption file is invalid: {}'.format(e))
+ raise InvalidSubtitleFormatError("Caption file is invalid: {}".format(e))
# allow other errors to be passed through
@@ -71,6 +81,7 @@ class SubtitleConverter:
"""
This class converts subtitle files to the preferred VTT format
"""
+
def __init__(self, readers, caption_str):
"""
:param readers: An array of `SubtitleReader` instances
@@ -99,10 +110,12 @@ def get_caption_set(self):
break
else:
self.caption_set = None
- raise InvalidSubtitleFormatError('Subtitle file is unsupported or unreadable')
+ raise InvalidSubtitleFormatError(
+ "Subtitle file is unsupported or unreadable"
+ )
if self.caption_set.is_empty():
- raise InvalidSubtitleLanguageError('Captions set is invalid')
+ raise InvalidSubtitleLanguageError("Captions set is invalid")
return self.caption_set
def get_language_codes(self):
@@ -141,7 +154,10 @@ def replace_unknown_language(self, lang_code):
# Replace caption_set with new version, having replaced unknown language
self.caption_set = CaptionSet(
- captions, styles=dict(caption_set.get_styles()), layout_info=caption_set.layout_info)
+ captions,
+ styles=dict(caption_set.get_styles()),
+ layout_info=caption_set.layout_info,
+ )
def write(self, out_filename, lang_code):
"""
@@ -151,7 +167,7 @@ def write(self, out_filename, lang_code):
:param out_filename: A string path to put the converted captions contents
:param lang_code: A string of the language code to write
"""
- with codecs.open(out_filename, 'w', encoding='utf-8') as converted_file:
+ with codecs.open(out_filename, "w", encoding="utf-8") as converted_file:
converted_file.write(self.convert(lang_code))
def convert(self, lang_code):
@@ -168,12 +184,14 @@ def convert(self, lang_code):
if not captions:
raise InvalidSubtitleLanguageError(
- "Language '{}' is not present in caption set".format(lang_code))
+ "Language '{}' is not present in caption set".format(lang_code)
+ )
styles = caption_set.get_styles()
layout_info = caption_set.get_layout_info(lang_code)
lang_caption_set = CaptionSet(
- {lang_code: captions}, styles=dict(styles), layout_info=layout_info)
+ {lang_code: captions}, styles=dict(styles), layout_info=layout_info
+ )
return self.writer.write(lang_caption_set)
@@ -181,6 +199,7 @@ def convert(self, lang_code):
# FACTORY FUNCTIONS #
#####################
+
def build_dfxp_reader():
return SubtitleReader(DFXPReader())
@@ -213,7 +232,7 @@ def build_vtt_reader():
def build_subtitle_reader(reader_format):
if reader_format not in BUILD_READER_MAP:
- raise InvalidSubtitleFormatError('Unsupported')
+ raise InvalidSubtitleFormatError("Unsupported")
return BUILD_READER_MAP[reader_format]()
@@ -256,9 +275,7 @@ def build_subtitle_converter_from_file(captions_filename, in_format=None):
:return: A SubtitleConverter
:rtype: SubtitleConverter
"""
- with codecs.open(captions_filename, encoding='utf-8') as captions_file:
+ with codecs.open(captions_filename, encoding="utf-8") as captions_file:
captions_str = captions_file.read()
return build_subtitle_converter(captions_str, in_format)
-
-
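
A short sketch of the conversion path exposed by the reformatted subtitles module, assuming an SRT input file; both file names are placeholders.

    from ricecooker.utils.subtitles import (
        LANGUAGE_CODE_UNKNOWN,
        build_subtitle_converter_from_file,
    )

    converter = build_subtitle_converter_from_file("captions.srt")
    if LANGUAGE_CODE_UNKNOWN in converter.get_language_codes():
        converter.replace_unknown_language("en")  # SRT-style readers report no language
    converter.write("captions.vtt", "en")         # writes WebVTT output
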
diff --git a/ricecooker/utils/thumbscropping.py b/ricecooker/utils/thumbscropping.py
index 7ce46462..bdf9e73e 100644
--- a/ricecooker/utils/thumbscropping.py
+++ b/ricecooker/utils/thumbscropping.py
@@ -1,21 +1,22 @@
import math
import re
-from PIL import Image
import sys
import types
+from PIL import Image
+
# Useful for very coarse version differentiation.
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
if PY3:
- string_types = str,
- integer_types = int,
- class_types = type,
+ string_types = (str,)
+ integer_types = (int,)
+ class_types = (type,)
text_type = str
binary_type = bytes
else:
- string_types = basestring,
+ string_types = (basestring,)
integer_types = (int, long)
class_types = (type, types.ClassType)
text_type = unicode
@@ -59,7 +60,6 @@ def image_entropy(im):
return -sum([p * math.log(p, 2) for p in hist if p != 0])
-
def _compare_entropy(start_slice, end_slice, slice, difference):
"""
Calculate the entropy of two slices (from the start and end of an axis),
@@ -80,8 +80,9 @@ def _compare_entropy(start_slice, end_slice, slice, difference):
return slice, 0
-
-def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None, **kwargs):
+def scale_and_crop(
+ im, size, crop=False, upscale=False, zoom=None, target=None, **kwargs
+):
"""
Handle scaling and cropping the source image.
Images can be scaled / cropped against a single dimension by using zero
@@ -148,9 +149,10 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None,
if scale < 1.0 or (scale > 1.0 and upscale):
# Resize the image to the target size boundary. Round the scaled
# boundary sizes to avoid floating point errors.
- im = im.resize((int(round(source_x * scale)),
- int(round(source_y * scale))),
- resample=Image.ANTIALIAS)
+ im = im.resize(
+ (int(round(source_x * scale)), int(round(source_y * scale))),
+ resample=Image.ANTIALIAS,
+ )
if crop:
# Use integer values now.
@@ -158,9 +160,9 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None,
# Difference between new image size and requested size.
diff_x = int(source_x - min(source_x, target_x))
diff_y = int(source_y - min(source_y, target_y))
- if crop != 'scale' and (diff_x or diff_y):
+ if crop != "scale" and (diff_x or diff_y):
if isinstance(target, string_types):
- target = re.match(r'(\d+)?,(\d+)?$', target)
+ target = re.match(r"(\d+)?,(\d+)?$", target)
if target:
target = target.groups()
if target:
@@ -178,8 +180,9 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None,
box.append(int(min(source_x, box[0] + target_x)))
box.append(int(min(source_y, box[1] + target_y)))
# See if an edge cropping argument was provided.
- edge_crop = (isinstance(crop, string_types) and
- re.match(r'(?:(-?)(\d+))?,(?:(-?)(\d+))?$', crop))
+ edge_crop = isinstance(crop, string_types) and re.match(
+ r"(?:(-?)(\d+))?,(?:(-?)(\d+))?$", crop
+ )
if edge_crop and filter(None, edge_crop.groups()):
x_right, x_crop, y_bottom, y_crop = edge_crop.groups()
if x_crop:
@@ -199,7 +202,7 @@ def scale_and_crop(im, size, crop=False, upscale=False, zoom=None, target=None,
box[1] = offset
box[3] = source_y - (diff_y - offset)
# See if the image should be "smart cropped".
- elif crop == 'smart':
+ elif crop == "smart":
left = top = 0
right, bottom = source_x, source_y
while diff_x:
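
A hedged usage sketch for the scale_and_crop helper reformatted above, assuming it returns the processed PIL image and that the input file exists; both file names are placeholders.

    from PIL import Image

    from ricecooker.utils.thumbscropping import scale_and_crop

    im = Image.open("thumbnail_source.png")
    # 16:9 target box; "smart" cropping trims the lower-entropy edges first.
    cropped = scale_and_crop(im, (400, 225), crop="smart", upscale=True)
    cropped.save("thumbnail_cropped.png")
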
diff --git a/ricecooker/utils/tokens.py b/ricecooker/utils/tokens.py
index e57a1646..d2623ac4 100644
--- a/ricecooker/utils/tokens.py
+++ b/ricecooker/utils/tokens.py
@@ -1,9 +1,8 @@
-
import json
import os
import sys
-try: # to support Python 2.x.
+try: # to support Python 2.x.
input = raw_input
except NameError:
pass
@@ -22,6 +21,7 @@ def get_env(envvar):
else:
return os.environ[envvar]
+
def get_content_curation_token(args_token):
"""
Get the token through one of four possible ways. Input `args_token` can be
@@ -31,18 +31,19 @@ def get_content_curation_token(args_token):
3a. if environment variable STUDIO_TOKEN exists, we'll use that
3b. else we prompt the user interactively
"""
- if args_token != "#": # retrieval methods 1, 2
+ if args_token != "#": # retrieval methods 1, 2
if os.path.isfile(args_token):
- with open(args_token, 'r') as fobj:
+ with open(args_token, "r") as fobj:
return fobj.read().strip()
else:
return args_token
- else: # retrieval strategies 3
- token = get_env('STUDIO_TOKEN') or get_env('CONTENT_CURATION_TOKEN')
+ else: # retrieval strategies 3
+ token = get_env("STUDIO_TOKEN") or get_env("CONTENT_CURATION_TOKEN")
if token is not None:
- return token # 3a
+ return token # 3a
else:
- return prompt_token(config.DOMAIN) # 3b
+ return prompt_token(config.DOMAIN) # 3b
+
def prompt_token(domain):
"""
@@ -51,9 +52,10 @@ def prompt_token(domain):
Returns: token
"""
token = input("\nEnter content curation server token ('q' to quit): ").lower()
- if token == 'q':
+ if token == "q":
sys.exit()
else:
return token.strip()
+
# SUSHI_BAR_TOKEN = get_env('SUSHI_BAR_TOKEN') # TODO in near future
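
The token-retrieval fallback order documented in tokens.py, as a small sketch; the file path in the commented line is a placeholder.

    from ricecooker.utils.tokens import get_content_curation_token

    # "#" means: fall back to the STUDIO_TOKEN / CONTENT_CURATION_TOKEN environment
    # variables, and prompt interactively if neither is set.
    token = get_content_curation_token("#")

    # Passing a literal token, or a path to a file containing one, also works:
    # token = get_content_curation_token("credentials/studio_token.txt")
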
diff --git a/ricecooker/utils/utils.py b/ricecooker/utils/utils.py
index d8c17c16..f4d5ee8f 100644
--- a/ricecooker/utils/utils.py
+++ b/ricecooker/utils/utils.py
@@ -17,4 +17,8 @@ def make_dir_if_needed(path):
class VideoURLFormatError(Exception):
def __init__(self, url, expected_format):
- self.message = "The video at {} does not appear to be a proper {} video URL.".format(url, expected_format)
\ No newline at end of file
+ self.message = (
+ "The video at {} does not appear to be a proper {} video URL.".format(
+ url, expected_format
+ )
+ )
diff --git a/ricecooker/utils/videos.py b/ricecooker/utils/videos.py
index 0c43c6fc..8bb9d893 100644
--- a/ricecooker/utils/videos.py
+++ b/ricecooker/utils/videos.py
@@ -1,6 +1,6 @@
+import logging
import re
import subprocess
-import logging
from le_utils.constants import format_presets
@@ -9,6 +9,7 @@
LOGGER = logging.getLogger("VideoResource")
LOGGER.setLevel(logging.DEBUG)
+
def guess_video_preset_by_resolution(videopath):
"""
Run `ffprobe` to find resolution classify as high resolution (video height >= 720),
@@ -17,19 +18,31 @@ def guess_video_preset_by_resolution(videopath):
"""
try:
LOGGER.debug("Entering 'guess_video_preset_by_resolution' method")
- result = subprocess.check_output(['ffprobe', '-v', 'error', '-print_format', 'json', '-show_entries',
- 'stream=width,height', '-of', 'default=noprint_wrappers=1', str(videopath)])
+ result = subprocess.check_output(
+ [
+ "ffprobe",
+ "-v",
+ "error",
+ "-print_format",
+ "json",
+ "-show_entries",
+ "stream=width,height",
+ "-of",
+ "default=noprint_wrappers=1",
+ str(videopath),
+ ]
+ )
LOGGER.debug("ffprobe stream result = {}".format(result))
- pattern = re.compile('width=([0-9]*)[^height]+height=([0-9]*)')
+ pattern = re.compile("width=([0-9]*)[^height]+height=([0-9]*)")
match = pattern.search(str(result))
if match is None:
return format_presets.VIDEO_LOW_RES
width, height = int(match.group(1)), int(match.group(2))
if height >= 720:
- LOGGER.info('Video preset from {} = high resolution'.format(videopath))
+ LOGGER.info("Video preset from {} = high resolution".format(videopath))
return format_presets.VIDEO_HIGH_RES
else:
- LOGGER.info('Video preset from {} = low resolution'.format(videopath))
+ LOGGER.info("Video preset from {} = low resolution".format(videopath))
return format_presets.VIDEO_LOW_RES
except Exception as e:
LOGGER.warning(e)
@@ -42,14 +55,44 @@ def extract_thumbnail_from_video(fpath_in, fpath_out, overwrite=False):
    The thumbnail image will be written to the file path given in `fpath_out`.
"""
try:
- result = subprocess.check_output(['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of',
- 'default=noprint_wrappers=1:nokey=1', "-loglevel", "panic", str(fpath_in)])
+ result = subprocess.check_output(
+ [
+ "ffprobe",
+ "-v",
+ "error",
+ "-show_entries",
+ "format=duration",
+ "-of",
+ "default=noprint_wrappers=1:nokey=1",
+ "-loglevel",
+ "panic",
+ str(fpath_in),
+ ]
+ )
        midpoint = float(re.search(r"\d+\.\d+", str(result)).group()) / 2
# scale parameters are from https://trac.ffmpeg.org/wiki/Scaling
scale = "scale=400:225:force_original_aspect_ratio=decrease,pad=400:225:(ow-iw)/2:(oh-ih)/2"
- command = ['ffmpeg',"-y" if overwrite else "-n", '-i', str(fpath_in), "-vf", scale, "-vcodec", "png", "-nostats",
- '-ss', str(midpoint), '-vframes', '1', '-q:v', '2', "-loglevel", "panic", str(fpath_out)]
+ command = [
+ "ffmpeg",
+ "-y" if overwrite else "-n",
+ "-i",
+ str(fpath_in),
+ "-vf",
+ scale,
+ "-vcodec",
+ "png",
+ "-nostats",
+ "-ss",
+ str(midpoint),
+ "-vframes",
+ "1",
+ "-q:v",
+ "2",
+ "-loglevel",
+ "panic",
+ str(fpath_out),
+ ]
subprocess.check_output(command, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
raise ThumbnailGenerationError("{}: {}".format(e, e.output))
@@ -59,6 +102,7 @@ class VideoCompressionError(Exception):
"""
Custom error returned when `ffmpeg` compression exits with a non-zero status.
"""
+
pass
@@ -74,20 +118,47 @@ def compress_video(source_file_path, target_file, overwrite=False, **kwargs):
# The output width and height for ffmpeg scale param must be divisible by 2
# using value -2 to get robust behaviour: maintains the aspect ratio and also
# ensure the calculated dimension is divisible by 2
- if 'max_width' in kwargs:
- scale = "'w=trunc(min(iw,{max_width})/2)*2:h=-2'".format(max_width=kwargs['max_width'])
- elif 'max_height' in kwargs:
- scale = "'w=-2:h=trunc(min(ih,{max_height})/2)*2'".format(max_height=kwargs['max_height'])
+ if "max_width" in kwargs:
+ scale = "'w=trunc(min(iw,{max_width})/2)*2:h=-2'".format(
+ max_width=kwargs["max_width"]
+ )
+ elif "max_height" in kwargs:
+ scale = "'w=-2:h=trunc(min(ih,{max_height})/2)*2'".format(
+ max_height=kwargs["max_height"]
+ )
else:
scale = "'w=-2:h=trunc(min(ih,480)/2)*2'" # default to max-height 480px
# set constant rate factor, see https://trac.ffmpeg.org/wiki/Encode/H.264#crf
- crf = kwargs['crf'] if 'crf' in kwargs else 32
+ crf = kwargs["crf"] if "crf" in kwargs else 32
# run command
- command = ["ffmpeg", "-y" if overwrite else "-n", "-i", source_file_path, "-profile:v", "baseline",
- "-level", "3.0", "-b:a", "32k", "-ac", "1", "-vf", "scale={}".format(scale),
- "-crf", str(crf), "-preset", "slow", "-v", "error", "-strict", "-2", "-stats", target_file]
+ command = [
+ "ffmpeg",
+ "-y" if overwrite else "-n",
+ "-i",
+ source_file_path,
+ "-profile:v",
+ "baseline",
+ "-level",
+ "3.0",
+ "-b:a",
+ "32k",
+ "-ac",
+ "1",
+ "-vf",
+ "scale={}".format(scale),
+ "-crf",
+ str(crf),
+ "-preset",
+ "slow",
+ "-v",
+ "error",
+ "-strict",
+ "-2",
+ "-stats",
+ target_file,
+ ]
try:
subprocess.check_output(command, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as e:
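A minimal sketch of how the helpers above are typically called (file names are hypothetical; `ffmpeg` and `ffprobe` are assumed to be on the PATH):

```python
from ricecooker.utils.videos import (
    compress_video,
    extract_thumbnail_from_video,
    guess_video_preset_by_resolution,
)

# Re-encode to H.264 baseline, capped at 480px height with CRF 32 (the defaults above).
compress_video("input.mp4", "compressed.mp4", overwrite=True, max_height=480, crf=32)

# Grab a 400x225 PNG thumbnail from the midpoint of the video.
extract_thumbnail_from_video("compressed.mp4", "thumbnail.png", overwrite=True)

# Returns format_presets.VIDEO_HIGH_RES when height >= 720, else VIDEO_LOW_RES.
preset = guess_video_preset_by_resolution("compressed.mp4")
```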
diff --git a/ricecooker/utils/web.py b/ricecooker/utils/web.py
index e72ee033..095e1f75 100644
--- a/ricecooker/utils/web.py
+++ b/ricecooker/utils/web.py
@@ -3,7 +3,6 @@
Note that we could not use html for the module name as recent versions of Python
include their own html module.
"""
-
import os
from bs4 import BeautifulSoup
@@ -13,15 +12,16 @@ class HTMLParser:
"""
HTMLParser contains a set of functions for parsing, scraping, and updating an HTML page.
"""
+
def __init__(self, filename=None, html=None):
self.filename = filename
self.html = html
self.link_tags = {
- 'a': 'href',
- 'audio': 'src',
- 'img': 'src',
- 'link': 'href',
- 'script': 'src'
+ "a": "href",
+ "audio": "src",
+ "img": "src",
+ "link": "href",
+ "script": "src",
}
def get_links(self):
@@ -34,7 +34,7 @@ def get_links(self):
if self.html is None:
basename = os.path.basename(self.filename)
self.html = open(self.filename).read()
- soup = BeautifulSoup(self.html, 'html.parser')
+ soup = BeautifulSoup(self.html, "html.parser")
extracted_links = []
for tag_name in self.link_tags:
@@ -43,11 +43,15 @@ def get_links(self):
link = tag.get(self.link_tags[tag_name])
# don't include links to ourselves or # links
# TODO: Should this part be moved to get_local_files instead?
- if link and (basename and not link.startswith(basename)) and not link.strip().startswith("#"):
- if '?' in link:
- link, query = link.split('?')
- if '#' in link:
- link, marker = link.split('#')
+ if (
+ link
+ and (basename and not link.startswith(basename))
+ and not link.strip().startswith("#")
+ ):
+ if "?" in link:
+ link, query = link.split("?")
+ if "#" in link:
+ link, marker = link.split("#")
extracted_links.append(link)
return extracted_links
@@ -63,7 +67,7 @@ def get_local_files(self):
for link in links:
# NOTE: This technically fails to handle file:// URLs, but we're highly unlikely to see
# file:// URLs in any distributed package, so this is simpler than parsing out the protocol.
- if not '://' in link:
+ if not "://" in link:
local_links.append(link)
return local_links
@@ -78,7 +82,7 @@ def replace_links(self, links_to_replace):
if self.html is None:
basename = os.path.basename(self.filename)
self.html = open(self.filename).read()
- soup = BeautifulSoup(self.html, 'html.parser')
+ soup = BeautifulSoup(self.html, "html.parser")
extracted_links = []
for tag_name in self.link_tags:
@@ -88,4 +92,4 @@ def replace_links(self, links_to_replace):
if link in links_to_replace:
tag[self.link_tags[tag_name]] = links_to_replace[link]
- return soup.prettify()
\ No newline at end of file
+ return soup.prettify()
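A minimal sketch of the `HTMLParser` helper being reformatted here (the file name and the replacement mapping are hypothetical):

```python
from ricecooker.utils.web import HTMLParser

parser = HTMLParser(filename="index.html")
links = parser.get_links()              # href/src values from a/audio/img/link/script tags
local_files = parser.get_local_files()  # the subset of links without a :// protocol
new_html = parser.replace_links({"css/old.css": "css/new.css"})  # returns prettified HTML
```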
diff --git a/ricecooker/utils/youtube.py b/ricecooker/utils/youtube.py
index e933fd09..3274ab42 100644
--- a/ricecooker/utils/youtube.py
+++ b/ricecooker/utils/youtube.py
@@ -1,18 +1,18 @@
-from enum import Enum
import copy
import json
import logging
import os
-import time
import re
-import youtube_dl
-
+import time
from datetime import datetime
+from enum import Enum
+
+import youtube_dl
from le_utils.constants import languages
-from ricecooker.config import LOGGER
from . import proxy
from . import utils
+from ricecooker.config import LOGGER
LOGGER = logging.getLogger("YouTubeResource")
@@ -20,8 +20,8 @@
NON_NETWORK_ERRORS = [
- youtube_dl.utils.ExtractorError, # private and unlisted videos
- youtube_dl.utils.PostProcessingError, # custom postprocessors failures
+ youtube_dl.utils.ExtractorError, # private and unlisted videos
+ youtube_dl.utils.PostProcessingError, # custom postprocessors failures
]
@@ -42,6 +42,7 @@ class YouTubeResource(object):
This class encapsulates functionality for information retrieval and download
of YouTube resources. Resources may include videos, playlists and channels.
"""
+
# If extract_info request takes longer than this we treat it as broken proxy
EXTRACT_TIME_SLOW_LIMIT = 20 # in seconds
@@ -51,22 +52,18 @@ def __init__(self, url, useproxy=True, high_resolution=False, options=None):
:param url: URL of a YouTube resource. URL may point to a video, playlist or channel.
"""
- if not 'youtube.com' in url and not 'youtu.be' in url:
- raise utils.VideoURLFormatError(url, 'YouTube')
+ if not "youtube.com" in url and not "youtu.be" in url:
+ raise utils.VideoURLFormatError(url, "YouTube")
self.url = url
self.subtitles = {}
self.num_retries = 10
self.sleep_seconds = 0.5
- self.preferred_formats = {
- 'video': 'mp4',
- 'audio': 'm4a'
- }
+ self.preferred_formats = {"video": "mp4", "audio": "m4a"}
self.useproxy = useproxy
self.high_resolution = high_resolution
self.options = options
self.client = None # this will become a YoutubeDL instance on first use
- self.info = None # save detailed info_dict returned from extract_info
-
+ self.info = None # save detailed info_dict returned from extract_info
def get_resource_info(self, options=None):
"""
@@ -75,26 +72,26 @@ def get_resource_info(self, options=None):
:return: A ricecooker-like dict of info about the channel, playlist or video.
"""
extract_info_options = dict(
- verbose = True, # TODO(ivan) change this to quiet = True eventually
- no_warnings = True,
- no_color = True,
+ verbose=True, # TODO(ivan) change this to quiet = True eventually
+ no_warnings=True,
+ no_color=True,
# By default, YouTubeDL will pick what it determines to be the best formats, but for consistency's sake
# we want to always get preferred formats (default of mp4 and m4a) when possible.
- format = "bestvideo[height<={maxheight}][ext={vext}]+bestaudio[ext={aext}]/best[height<={maxheight}][ext={vext}]".format(
+ format="bestvideo[height<={maxheight}][ext={vext}]+bestaudio[ext={aext}]/best[height<={maxheight}][ext={vext}]".format(
maxheight=720 if self.high_resolution else 480,
- vext=self.preferred_formats['video'],
- aext=self.preferred_formats['audio']
+ vext=self.preferred_formats["video"],
+ aext=self.preferred_formats["audio"],
),
)
for i in range(self.num_retries):
if self.useproxy:
dl_proxy = proxy.choose_proxy()
- extract_info_options['proxy'] = dl_proxy
+ extract_info_options["proxy"] = dl_proxy
if self.options:
extract_info_options.update(self.options) # init-time options
if options:
- extract_info_options.update(options) # additional options
+ extract_info_options.update(options) # additional options
try:
LOGGER.debug("YoutubeDL options = {}".format(extract_info_options))
@@ -103,17 +100,22 @@ def get_resource_info(self, options=None):
LOGGER.debug("Calling extract_info for URL {}".format(self.url))
start_time = datetime.now()
- self.info = self.client.extract_info(self.url, download=False, process=True)
+ self.info = self.client.extract_info(
+ self.url, download=False, process=True
+ )
end_time = datetime.now()
# Mark slow proxies as broken
extract_time = (end_time - start_time).total_seconds()
- LOGGER.debug('extract_time = ' + str(extract_time))
+ LOGGER.debug("extract_time = " + str(extract_time))
if self.useproxy and extract_time > self.EXTRACT_TIME_SLOW_LIMIT:
- if 'entries' in self.info:
+ if "entries" in self.info:
pass # it's OK for extract_info to be slow for playlists
else:
- proxy.record_error_for_proxy(dl_proxy, exception='extract_info took ' + extract_time + ' seconds')
+ proxy.record_error_for_proxy(
+ dl_proxy,
+ exception="extract_info took " + extract_time + " seconds",
+ )
LOGGER.info("Found slow proxy {}".format(dl_proxy))
# Format info JSON into ricecooker-like keys
@@ -134,7 +136,6 @@ def get_resource_info(self, options=None):
LOGGER.warning("Info extraction failed, retrying...")
time.sleep(self.sleep_seconds)
-
def get_dir_name_from_url(self, url=None):
"""
Takes a URL and returns a directory name to store files in.
@@ -148,7 +149,6 @@ def get_dir_name_from_url(self, url=None):
name = name.split("?")[0]
return " ".join(name.split("_")).title()
-
def download(self, base_path=None, useproxy=False, options=None):
"""
Download the YouTube resource(s) specified in `self.info`. If `self.info`
@@ -159,17 +159,17 @@ def download(self, base_path=None, useproxy=False, options=None):
download_dir = os.path.join(base_path, self.get_dir_name_from_url())
utils.make_dir_if_needed(download_dir)
else:
- download_dir = '.'
+ download_dir = "."
if self.client is None or self.info is None:
# download should always be called after self.info is available
self.get_resource_info()
# Set reasonable default download options...
- self.client.params['outtmpl'] = '{}/%(id)s.%(ext)s'.format(download_dir)
- self.client.params['writethumbnail'] = True # TODO(ivan): revisit this
- self.client.params['continuedl'] = False # clean start to avoid errors
- self.client.params['noprogress'] = True # progressbar doesn't log well
+ self.client.params["outtmpl"] = "{}/%(id)s.%(ext)s".format(download_dir)
+ self.client.params["writethumbnail"] = True # TODO(ivan): revisit this
+ self.client.params["continuedl"] = False # clean start to avoid errors
+ self.client.params["noprogress"] = True # progressbar doesn't log well
if options:
# ...but override them based on user choices when specified
self.client.params.update(options)
@@ -182,16 +182,20 @@ def download(self, base_path=None, useproxy=False, options=None):
if useproxy:
                # If the useproxy override is specified, choose a new proxy server:
dl_proxy = proxy.choose_proxy()
- self.client.params['proxy'] = dl_proxy
+ self.client.params["proxy"] = dl_proxy
self.client._setup_opener() # this will re-initialize downloader
- elif not useproxy and 'proxy' in self.client.params and self.client.params['proxy']:
+ elif (
+ not useproxy
+ and "proxy" in self.client.params
+ and self.client.params["proxy"]
+ ):
# Disable proxy if it was used for the get_resource_info call
- self.client.params['proxy'] = None
+ self.client.params["proxy"] = None
self.client._setup_opener() # this will re-initialize downloader
try:
self.info = self.client.process_ie_result(self.info, download=True)
- LOGGER.debug('Finished process_ie_result successfully')
+ LOGGER.debug("Finished process_ie_result successfully")
break
except Exception as e:
network_related_error = True
@@ -209,25 +213,24 @@ def download(self, base_path=None, useproxy=False, options=None):
os.remove(download_filename)
LOGGER.warning(e)
if i < self.num_retries - 1:
- LOGGER.warning("Download {} failed, retrying...".format(i+1))
+ LOGGER.warning("Download {} failed, retrying...".format(i + 1))
time.sleep(self.sleep_seconds)
# Post-process results
# TODO(ivan): handle post processing filename when custom `outtmpl` specified in options
if self.info:
edited_results = self._format_for_ricecooker(self.info)
- if 'children' in edited_results:
- for child in edited_results['children']:
- vfilename = "{}.{}".format(child["id"], child['ext'])
- child['filename'] = os.path.join(download_dir, vfilename)
+ if "children" in edited_results:
+ for child in edited_results["children"]:
+ vfilename = "{}.{}".format(child["id"], child["ext"])
+ child["filename"] = os.path.join(download_dir, vfilename)
else:
- vfilename = "{}.{}".format(edited_results["id"], edited_results['ext'])
- edited_results['filename'] = os.path.join(download_dir, vfilename)
+ vfilename = "{}.{}".format(edited_results["id"], edited_results["ext"])
+ edited_results["filename"] = os.path.join(download_dir, vfilename)
return edited_results
else:
return None
-
def get_resource_subtitles(self, options=None):
"""
Retrieves the subtitles for the video(s) represented by this resource.
@@ -237,9 +240,9 @@ def get_resource_subtitles(self, options=None):
:return: A dictionary object that contains information about video subtitles
"""
options_for_subtitles = dict(
- writesubtitles = True, # extract subtitles info
- allsubtitles = True, # get all available languages
- writeautomaticsub = False, # do not include auto-generated subs
+ writesubtitles=True, # extract subtitles info
+ allsubtitles=True, # get all available languages
+ writeautomaticsub=False, # do not include auto-generated subs
)
if options:
options_for_subtitles.update(options)
@@ -247,7 +250,6 @@ def get_resource_subtitles(self, options=None):
info = self.get_resource_info(options=options_for_subtitles)
return info
-
def _format_for_ricecooker(self, results):
"""
Internal method for converting YouTube resource info into the format expected by ricecooker.
@@ -260,42 +262,41 @@ def _format_for_ricecooker(self, results):
# dict mapping of field name and default value when not found.
extracted_fields = {
- 'id': '',
- 'title': '',
- 'description': '',
- 'ext': 'mp4',
- 'thumbnail': '',
- 'webpage_url': '',
- 'tags': [],
- 'subtitles': {},
- 'requested_subtitles': '',
- 'artist': '',
- 'license': '',
- '_type': 'video'
+ "id": "",
+ "title": "",
+ "description": "",
+ "ext": "mp4",
+ "thumbnail": "",
+ "webpage_url": "",
+ "tags": [],
+ "subtitles": {},
+ "requested_subtitles": "",
+ "artist": "",
+ "license": "",
+ "_type": "video",
}
for field_name in extracted_fields:
info_name = field_name
- if info_name == '_type':
- info_name = 'kind'
- elif info_name == 'webpage_url':
- info_name = 'source_url'
+ if info_name == "_type":
+ info_name = "kind"
+ elif info_name == "webpage_url":
+ info_name = "source_url"
if field_name in results:
leaf[info_name] = results[field_name]
else:
leaf[info_name] = extracted_fields[field_name]
- if 'entries' in results:
- leaf['children'] = []
- for entry in results['entries']:
+ if "entries" in results:
+ leaf["children"] = []
+ for entry in results["entries"]:
if entry is not None:
- leaf['children'].append(self._format_for_ricecooker(entry))
+ leaf["children"].append(self._format_for_ricecooker(entry))
else:
                    LOGGER.info("Skipping None entry because extract_info failed")
return leaf
-
def check_for_content_issues(self, filter=False):
"""
Checks the YouTube resource and looks for any issues that may prevent download or distribution of the material,
@@ -309,28 +310,27 @@ def check_for_content_issues(self, filter=False):
output_video_info = copy.copy(resource_info)
videos_with_warnings = []
if filter:
- output_video_info['children'] = []
+ output_video_info["children"] = []
- for video in resource_info['children']:
+ for video in resource_info["children"]:
warnings = []
- if not video['license']:
- warnings.append('no_license_specified')
- elif video['license'].find("Creative Commons") == -1:
- warnings.append('closed_license')
+ if not video["license"]:
+ warnings.append("no_license_specified")
+ elif video["license"].find("Creative Commons") == -1:
+ warnings.append("closed_license")
if len(warnings) > 0:
- videos_with_warnings.append({'video': video, 'warnings': warnings})
+ videos_with_warnings.append({"video": video, "warnings": warnings})
elif filter:
- output_video_info['children'].append(video)
+ output_video_info["children"].append(video)
return videos_with_warnings, output_video_info
-
-
# YOUTUBE LANGUAGE CODE HELPERS
################################################################################
+
def get_language_with_alpha2_fallback(language_code):
"""
Lookup language code `language_code` (string) in the internal language codes,
@@ -356,47 +356,48 @@ def is_youtube_subtitle_file_supported_language(language):
"""
language_obj = get_language_with_alpha2_fallback(language)
if language_obj is None:
- print('Found unsupported language code {}'.format(language))
+ print("Found unsupported language code {}".format(language))
return False
else:
return True
-
# CONSTANTS for YouTube cache
################################################################################
-CHEFDATA_DIR = 'chefdata'
-DEFAULT_YOUTUBE_CACHE_DIR = os.path.join(CHEFDATA_DIR, 'youtubecache')
+CHEFDATA_DIR = "chefdata"
+DEFAULT_YOUTUBE_CACHE_DIR = os.path.join(CHEFDATA_DIR, "youtubecache")
# CONSTANTS for YouTube resources
################################################################################
YOUTUBE_VIDEO_REGEX = re.compile(
- r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P[A-Za-z0-9\-=_]{11})'
+ r"(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P[A-Za-z0-9\-=_]{11})"
)
YOUTUBE_PLAYLIST_URL_FORMAT = "https://www.youtube.com/playlist?list={0}"
YOUTUBE_VIDEO_URL_FORMAT = "https://www.youtube.com/watch?v={0}"
+
class YouTubeTypes(Enum):
"""
Enum containing YouTube resource types
"""
+
YOUTUBE_BASE = "YouTubeBase"
YOUTUBE_VIDEO = "YouTubeVideo"
YOUTUBE_PLAYLIST = "YouTubePlayList"
YOUTUBE_CHANNEL = "YouTubeChannel"
-class YouTubeUtils(object):
+class YouTubeUtils(object):
def __init__(self, id, type=YouTubeTypes.YOUTUBE_BASE):
self.id = id
self.type = type
- self.cache_dir = ''
- self.cache_path = ''
- self.url = ''
+ self.cache_dir = ""
+ self.cache_path = ""
+ self.url = ""
def __str__(self):
- return '%s (%s)' % (self.type, self.cachename)
+ return "%s (%s)" % (self.type, self.cachename)
def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
youtube_info = None
@@ -412,7 +413,11 @@ def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
youtube_resource = YouTubeResource(self.url, useproxy=use_proxy)
except youtube_dl.utils.ExtractorError as e:
if "unavailable" in str(e):
- LOGGER.error("==> [%s] Resource unavailable for URL: %s", self.__str__, self.url)
+ LOGGER.error(
+ "==> [%s] Resource unavailable for URL: %s",
+                    self.__str__(),
+ self.url,
+ )
return None
if youtube_resource:
@@ -420,24 +425,30 @@ def _get_youtube_info(self, use_proxy=True, use_cache=True, options=None):
# Save YouTube info to JSON cache file
youtube_info = youtube_resource.get_resource_info(options)
if youtube_info:
- json.dump(youtube_info,
- open(self.cache_path, 'w'),
- indent=4,
- ensure_ascii=False,
- sort_keys=True)
+ json.dump(
+ youtube_info,
+ open(self.cache_path, "w"),
+ indent=4,
+ ensure_ascii=False,
+ sort_keys=True,
+ )
else:
- LOGGER.error("==> [%s] Failed to extract YouTube info", self.__str__())
+ LOGGER.error(
+ "==> [%s] Failed to extract YouTube info", self.__str__()
+ )
except Exception as e:
- LOGGER.error("==> [%s] Failed to get YouTube info: %s", self.__str__(), e)
+ LOGGER.error(
+ "==> [%s] Failed to get YouTube info: %s", self.__str__(), e
+ )
return None
return youtube_info
-class YouTubeVideoUtils(YouTubeUtils):
+class YouTubeVideoUtils(YouTubeUtils):
def __init_subclass__(cls):
return super().__init_subclass__()
- def __init__(self, id, alias='', cache_dir=''):
+ def __init__(self, id, alias="", cache_dir=""):
"""
Initializes YouTubeVideoUtils object with id
:param id: YouTube video ID
@@ -453,9 +464,11 @@ def __init__(self, id, alias='', cache_dir=''):
self.cache_dir = DEFAULT_YOUTUBE_CACHE_DIR
else:
self.cache_dir = cache_dir
- self.cache_path = os.path.join(self.cache_dir, self.cachename + '.json')
+ self.cache_path = os.path.join(self.cache_dir, self.cachename + ".json")
- def get_video_info(self, use_proxy=True, use_cache=True, get_subtitle_languages=False, options=None):
+ def get_video_info(
+ self, use_proxy=True, use_cache=True, get_subtitle_languages=False, options=None
+ ):
"""
        Get YouTube video info, either by requesting the URL or by reading the local JSON cache
        :param use_cache: Whether video info may be read from the local JSON cache; defaults to True
@@ -467,18 +480,20 @@ def get_video_info(self, use_proxy=True, use_cache=True, get_subtitle_languages=
extract_options = dict()
if get_subtitle_languages:
options_for_subtitles = dict(
- writesubtitles=True, # extract subtitles info
- allsubtitles=True, # get all available languages
+ writesubtitles=True, # extract subtitles info
+ allsubtitles=True, # get all available languages
writeautomaticsub=False, # do not include auto-generated subs
)
extract_options.update(options_for_subtitles)
if options:
extract_options.update(options)
- return self._get_youtube_info(use_proxy=use_proxy, use_cache=use_cache, options=extract_options)
+ return self._get_youtube_info(
+ use_proxy=use_proxy, use_cache=use_cache, options=extract_options
+ )
-class YouTubePlaylistUtils(YouTubeUtils):
- def __init__(self, id, alias='', cache_dir=''):
+class YouTubePlaylistUtils(YouTubeUtils):
+ def __init__(self, id, alias="", cache_dir=""):
"""
Initializes YouTubePlaylistUtils object with id
:param id: YouTube playlist ID
@@ -494,9 +509,11 @@ def __init__(self, id, alias='', cache_dir=''):
self.cache_dir = DEFAULT_YOUTUBE_CACHE_DIR
else:
self.cache_dir = cache_dir
- self.cache_path = os.path.join(self.cache_dir, self.cachename + '.json')
+ self.cache_path = os.path.join(self.cache_dir, self.cachename + ".json")
- def get_playlist_info(self, use_proxy=True, use_cache=True, youtube_skip_download=True, options=None):
+ def get_playlist_info(
+ self, use_proxy=True, use_cache=True, youtube_skip_download=True, options=None
+ ):
"""
Get YouTube playlist info by either requesting URL or extracting local cache
:param use_cache: Define if allowed to get playlist info from local JSON cache, default to True
@@ -506,9 +523,10 @@ def get_playlist_info(self, use_proxy=True, use_cache=True, youtube_skip_downloa
:return: A ricecooker-like info dict info about the playlist or None if extraction fails
"""
youtube_extract_options = dict(
- skip_download=youtube_skip_download,
- extract_flat=True
+ skip_download=youtube_skip_download, extract_flat=True
)
if options:
youtube_extract_options.update(options)
- return self._get_youtube_info(use_proxy=use_proxy, use_cache=use_cache, options=youtube_extract_options)
+ return self._get_youtube_info(
+ use_proxy=use_proxy, use_cache=use_cache, options=youtube_extract_options
+ )
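A minimal sketch of the YouTube helpers above, using the `C0DPdy98e4c` test video that also appears in the test fixtures (network access and, when proxying is enabled, a working proxy list are assumed):

```python
from ricecooker.utils.youtube import YouTubeResource, YouTubeVideoUtils

# Direct, un-proxied lookup of a single video.
resource = YouTubeResource("https://www.youtube.com/watch?v=C0DPdy98e4c", useproxy=False)
info = resource.get_resource_info()        # ricecooker-like dict (id, title, license, ...)
result = resource.download(base_path="chefdata")

# Cached lookup: JSON is written under chefdata/youtubecache/ (directory assumed to exist)
# and reused on later runs.
video = YouTubeVideoUtils("C0DPdy98e4c")
cached_info = video.get_video_info(use_proxy=False, use_cache=True)
```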
diff --git a/ricecooker/utils/zip.py b/ricecooker/utils/zip.py
index b7478ed9..1981852d 100644
--- a/ricecooker/utils/zip.py
+++ b/ricecooker/utils/zip.py
@@ -39,7 +39,9 @@ def create_predictable_zip(path, entrypoint=None):
f.close()
for root, directories, filenames in os.walk(path):
- paths += [os.path.join(root, filename)[len(path)+1:] for filename in filenames]
+ paths += [
+ os.path.join(root, filename)[len(path) + 1 :] for filename in filenames
+ ]
reader = lambda x: _read_file(os.path.join(path, x))
# otherwise, if it's a zip file, open it up and pull out the list of names
elif os.path.isfile(path) and os.path.splitext(path)[1] == ".zip":
@@ -55,7 +57,9 @@ def create_predictable_zip(path, entrypoint=None):
with zipfile.ZipFile(zippath, "w") as outputzip:
# loop over the file paths in sorted order, to ensure a predictable zip
for filepath in sorted(paths):
- write_file_to_zip_with_neutral_metadata(outputzip, filepath, reader(filepath))
+ write_file_to_zip_with_neutral_metadata(
+ outputzip, filepath, reader(filepath)
+ )
os.fdopen(zippathfd).close()
return zippath
@@ -74,4 +78,3 @@ def write_file_to_zip_with_neutral_metadata(zfile, filename, content):
info.comment = "".encode()
info.create_system = 0
zfile.writestr(info, content)
-
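A minimal sketch of `create_predictable_zip`, which the hunks above reformat (the webroot directory is hypothetical):

```python
from ricecooker.utils.zip import create_predictable_zip

# Files are added in sorted order with neutral metadata, so the same input
# directory always produces an identical zip file.
zippath = create_predictable_zip("chefdata/webroot")
print("Deterministic zip written to", zippath)
```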
diff --git a/setup.py b/setup.py
index ed24e053..a94fd017 100644
--- a/setup.py
+++ b/setup.py
@@ -6,23 +6,23 @@
import ricecooker
-readme = open('README.md').read()
+readme = open("README.md").read()
-with open('docs/history.rst') as history_file:
+with open("docs/history.rst") as history_file:
history = history_file.read()
requirements = [
"pytest>=3.0.2",
"requests>=2.11.1",
"le_utils>=0.1.26",
- "validators", # TODO: check if this is necessary
+ "validators", # TODO: check if this is necessary
"requests_file",
- "beautifulsoup4>=4.6.3,<4.9.0", # pinned to match versions in le-pycaption
+ "beautifulsoup4>=4.6.3,<4.9.0", # pinned to match versions in le-pycaption
"selenium==3.0.1",
"youtube-dl>=2020.6.16.1",
"html5lib",
"cachecontrol==0.12.0",
- "lockfile==0.12.2", # TODO: check if this is necessary
+ "lockfile==0.12.2", # TODO: check if this is necessary
"css-html-js-minify==2.2.2",
"mock==2.0.0",
"pypdf2>=1.26.0",
@@ -40,36 +40,36 @@
setup(
- name='ricecooker',
+ name="ricecooker",
version=ricecooker.__version__,
description="API for adding content to the Kolibri content curation server",
- long_description=readme + '\n\n' + history,
- long_description_content_type='text/markdown',
+ long_description=readme + "\n\n" + history,
+ long_description_content_type="text/markdown",
author="Learning Equality",
- author_email='dev@learningequality.org',
- url='https://github.com/learningequality/ricecooker',
+ author_email="dev@learningequality.org",
+ url="https://github.com/learningequality/ricecooker",
packages=find_packages(),
- package_dir={'ricecooker':'ricecooker'},
- entry_points = {
- 'console_scripts': [
- 'corrections = ricecooker.utils.corrections:correctionsmain',
- 'jiro = ricecooker.cli:main'
+ package_dir={"ricecooker": "ricecooker"},
+ entry_points={
+ "console_scripts": [
+ "corrections = ricecooker.utils.corrections:correctionsmain",
+ "jiro = ricecooker.cli:main",
],
},
include_package_data=True,
install_requires=requirements,
license="MIT license",
zip_safe=False,
- keywords='ricecooker',
+ keywords="ricecooker",
classifiers=[
- 'Intended Audience :: Developers',
- 'Development Status :: 5 - Production/Stable',
- 'License :: OSI Approved :: MIT License',
- 'Programming Language :: Python :: 3.6',
- 'Programming Language :: Python :: 3.7',
- 'Programming Language :: Python :: 3.8',
- 'Natural Language :: English',
- 'Topic :: Education',
+ "Intended Audience :: Developers",
+ "Development Status :: 5 - Production/Stable",
+ "License :: OSI Approved :: MIT License",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3.8",
+ "Natural Language :: English",
+ "Topic :: Education",
],
- test_suite='tests',
+ test_suite="tests",
)
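The `console_scripts` entry points above map directly to Python callables, so running the installed `corrections` command is roughly equivalent to the sketch below (assuming ricecooker is installed in the active environment):

```python
# Rough equivalent of the `corrections` console script declared in entry_points.
from ricecooker.utils.corrections import correctionsmain

if __name__ == "__main__":
    correctionsmain()
```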
diff --git a/tests/conftest.py b/tests/conftest.py
index 57d5d6e4..8e1f65ed 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,57 +1,86 @@
import copy
import glob
import os
-import pytest
-import requests
import uuid
import zipfile
-from le_utils.constants import licenses, content_kinds, exercises, roles
-from ricecooker.__init__ import __version__
-from ricecooker.classes.files import AudioFile, DocumentFile, EPubFile, HTMLZipFile, ThumbnailFile, SlideImageFile, SubtitleFile, VideoFile
-from ricecooker.classes.files import _ExerciseImageFile, _ExerciseBase64ImageFile, _ExerciseGraphieFile
-from ricecooker.classes.nodes import AudioNode, ChannelNode, DocumentNode, ExerciseNode, HTML5AppNode, SlideshowNode, TopicNode, VideoNode
-from ricecooker.classes.questions import InputQuestion, SingleSelectQuestion
+import pytest
+import requests
+from le_utils.constants import content_kinds
+from le_utils.constants import exercises
+from le_utils.constants import licenses
+from le_utils.constants import roles
+from ricecooker.__init__ import __version__
+from ricecooker.classes.files import _ExerciseBase64ImageFile
+from ricecooker.classes.files import _ExerciseGraphieFile
+from ricecooker.classes.files import _ExerciseImageFile
+from ricecooker.classes.files import AudioFile
+from ricecooker.classes.files import DocumentFile
+from ricecooker.classes.files import EPubFile
+from ricecooker.classes.files import HTMLZipFile
+from ricecooker.classes.files import SlideImageFile
+from ricecooker.classes.files import SubtitleFile
+from ricecooker.classes.files import ThumbnailFile
+from ricecooker.classes.files import VideoFile
+from ricecooker.classes.nodes import AudioNode
+from ricecooker.classes.nodes import ChannelNode
+from ricecooker.classes.nodes import DocumentNode
+from ricecooker.classes.nodes import ExerciseNode
+from ricecooker.classes.nodes import HTML5AppNode
+from ricecooker.classes.nodes import SlideshowNode
+from ricecooker.classes.nodes import TopicNode
+from ricecooker.classes.nodes import VideoNode
+from ricecooker.classes.questions import InputQuestion
+from ricecooker.classes.questions import SingleSelectQuestion
# GLOBAL TEST SETUP/TEARDOWN UTILS
################################################################################
+
def pytest_sessionfinish(session, exitstatus):
"""
Cleanup testcontent/generated/ directory after each test run is finished.
"""
generated_path = os.path.join("tests", "testcontent", "generated")
- for path in glob.glob(generated_path + os.path.sep + '*'):
+ for path in glob.glob(generated_path + os.path.sep + "*"):
os.remove(path)
# CHANNEL FIXTURES
################################################################################
+
@pytest.fixture
def domain_namespace():
return "testing.learningequality.org"
+
@pytest.fixture
def channel_source_id():
return "channel-id"
+
@pytest.fixture
def channel_domain_namespace(domain_namespace):
return uuid.uuid5(uuid.NAMESPACE_DNS, domain_namespace)
+
@pytest.fixture
def channel_node_id(channel_domain_namespace, channel_source_id):
return uuid.uuid5(channel_domain_namespace, channel_source_id)
+
@pytest.fixture
def channel_content_id(channel_domain_namespace, channel_node_id):
return uuid.uuid5(channel_domain_namespace, channel_node_id.hex)
+
@pytest.fixture
-def channel_data(channel_node_id, channel_content_id, domain_namespace, channel_source_id):
+def channel_data(
+ channel_node_id, channel_content_id, domain_namespace, channel_source_id
+):
return {
"id": channel_node_id.hex,
"name": "Channel",
@@ -67,34 +96,31 @@ def channel_data(channel_node_id, channel_content_id, domain_namespace, channel_
"extra_fields": "{}",
}
+
@pytest.fixture
def channel(domain_namespace, channel_source_id, channel_data):
channel = ChannelNode(
channel_source_id,
domain_namespace,
- title=channel_data['name'],
- description=channel_data['description'],
- tagline=channel_data['tagline'],
- language=channel_data['language']
+ title=channel_data["name"],
+ description=channel_data["description"],
+ tagline=channel_data["tagline"],
+ language=channel_data["language"],
)
return channel
+
@pytest.fixture
def invalid_channel(channel_source_id, domain_namespace):
- channel = ChannelNode(
- channel_source_id,
- domain_namespace,
- title='Invalid Channel'
- )
+ channel = ChannelNode(channel_source_id, domain_namespace, title="Invalid Channel")
channel.source_id = None
return channel
-
-
# ID, ARGS, AND KWARGS FIXTURE HELPERS
################################################################################
+
@pytest.fixture
def base_data(channel_domain_namespace, title):
"""
@@ -108,15 +134,15 @@ def base_data(channel_domain_namespace, title):
"description": "Description",
"author": "Author",
"source_domain": channel_domain_namespace.hex,
- "files" : [],
+ "files": [],
"tags": [],
"questions": [],
"extra_fields": {}, # dict as input kwarg, but json.dumps-ed in to_dict
"license": None,
"copyright_holder": "",
"license_description": None,
- "aggregator": "", # New in ricecooker 0.6.20
- "provider": "", # New in ricecooker 0.6.20
+ "aggregator": "", # New in ricecooker 0.6.20
+ "provider": "", # New in ricecooker 0.6.20
}
@@ -135,38 +161,38 @@ def genrate_random_ids(channel_domain_namespace, channel_node_id):
return ids_dict
-
-
# TOPIC FIXTURES
################################################################################
+
def get_topic_node_args(node_data):
"""
Returns (source_id, title) from node_data dictionary.
"""
node_data = copy.deepcopy(node_data)
- source_id = node_data.pop('source_id')
- title = node_data.pop('title')
- license = node_data.pop('license')
+ source_id = node_data.pop("source_id")
+ title = node_data.pop("title")
+ license = node_data.pop("license")
return source_id, title
+
def get_topic_node_kwargs_data(node_data):
"""
Returns all keywords data other than source_id, title, and license.
"""
node_data = copy.deepcopy(node_data)
- del node_data['source_id']
- del node_data['title']
+ del node_data["source_id"]
+ del node_data["title"]
# the following attributes will appear in `to_dict` method, but we don't need
# to pass them in when creating a TopicNode
- del node_data['content_id']
- del node_data['node_id']
- del node_data['kind']
- del node_data['source_domain']
- del node_data['questions']
- del node_data['license']
- del node_data['license_description']
- del node_data['copyright_holder']
+ del node_data["content_id"]
+ del node_data["node_id"]
+ del node_data["kind"]
+ del node_data["source_domain"]
+ del node_data["questions"]
+ del node_data["license"]
+ del node_data["license_description"]
+ del node_data["copyright_holder"]
return node_data
@@ -180,9 +206,10 @@ def topic_data(base_data, channel_domain_namespace, channel_node_id):
topic_data = copy.deepcopy(base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
topic_data.update(ids_dict)
- topic_data.update({ "kind": content_kinds.TOPIC })
+ topic_data.update({"kind": content_kinds.TOPIC})
return topic_data
+
@pytest.fixture
def topic(channel, title, topic_data):
args_data = get_topic_node_args(topic_data)
@@ -195,16 +222,21 @@ def topic(channel, title, topic_data):
# CONTENT NODE FIXTURES
################################################################################
+
@pytest.fixture
def contentnode_base_data(base_data):
"""
Shared data for all ContentNode fixtures.
"""
data = copy.deepcopy(base_data)
- data.update({ "license": licenses.CC_BY,
- "copyright_holder": "Copyright Holder",
- "license_description": None,
- "role": roles.LEARNER})
+ data.update(
+ {
+ "license": licenses.CC_BY,
+ "copyright_holder": "Copyright Holder",
+ "license_description": None,
+ "role": roles.LEARNER,
+ }
+ )
return data
@@ -213,9 +245,9 @@ def get_content_node_args(node_data):
Returns (source_id, title, license) from node_data dictionary.
"""
node_data = copy.deepcopy(node_data)
- source_id = node_data.pop('source_id')
- title = node_data.pop('title')
- license = node_data.pop('license')
+ source_id = node_data.pop("source_id")
+ title = node_data.pop("title")
+ license = node_data.pop("license")
return source_id, title, license
@@ -224,16 +256,16 @@ def get_content_node_kwargs(node_data):
Returns all keywords data other than source_id, title, and license.
"""
node_data = copy.deepcopy(node_data)
- del node_data['source_id']
- del node_data['title']
- del node_data['license']
+ del node_data["source_id"]
+ del node_data["title"]
+ del node_data["license"]
# below are vars from internal representation
- del node_data['content_id']
- del node_data['node_id']
- del node_data['kind']
- del node_data['source_domain']
- del node_data['questions']
- node_data['extra_fields'] = {}
+ del node_data["content_id"]
+ del node_data["node_id"]
+ del node_data["kind"]
+ del node_data["source_domain"]
+ del node_data["questions"]
+ node_data["extra_fields"] = {}
return node_data
@@ -241,57 +273,63 @@ def get_content_node_kwargs(node_data):
def base_file_path():
return "test/file/path"
+
@pytest.fixture
def contentnode_invalid_license(video):
video = copy.deepcopy(video)
video.license = None
return video
+
@pytest.fixture
def contentnode_invalid_files(video):
video = copy.deepcopy(video)
video.files = []
return video
+
@pytest.fixture
def contentnode_no_source_id(title):
- topic = TopicNode('some source id', title)
+ topic = TopicNode("some source id", title)
topic.source_id = None
return topic
-
-
-
# VIDEO FIXTURES
################################################################################
+
@pytest.fixture
-def video_file(): # uses same file as test_videos.low_res_video fixture
- source_url = "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4"
+def video_file(): # uses same file as test_videos.low_res_video fixture
+ source_url = (
+ "https://archive.org/download/vd_is_for_everybody/vd_is_for_everybody_512kb.mp4"
+ )
local_path = os.path.join("tests", "testcontent", "downloaded", "low_res_video.mp4")
download_fixture_file(source_url, local_path)
assert os.path.exists(local_path)
return VideoFile(local_path)
+
@pytest.fixture
def video_filename():
- return '897d83a2e5389d454d37feb574587516.mp4'
+ return "897d83a2e5389d454d37feb574587516.mp4"
+
@pytest.fixture
def subtitle_file():
local_path = os.path.join("tests", "testcontent", "generated", "testsubtitles.vtt")
if not os.path.exists(local_path):
- with open(local_path, 'wb') as subtitlefile:
- subtitlefile.write(b'WEBVTT\n')
- subtitlefile.write(b'\n')
- subtitlefile.write(b'00:01.000 --> 00:04.250\n')
- subtitlefile.write(b'Testing subtitles\n')
- return SubtitleFile(local_path, language='en')
+ with open(local_path, "wb") as subtitlefile:
+ subtitlefile.write(b"WEBVTT\n")
+ subtitlefile.write(b"\n")
+ subtitlefile.write(b"00:01.000 --> 00:04.250\n")
+ subtitlefile.write(b"Testing subtitles\n")
+ return SubtitleFile(local_path, language="en")
+
@pytest.fixture
def subtitle_filename():
- return '19faefeb0b8b8289923dc0c1c5adb7e5.vtt'
+ return "19faefeb0b8b8289923dc0c1c5adb7e5.vtt"
@pytest.fixture
@@ -299,9 +337,10 @@ def video_data(contentnode_base_data, channel_domain_namespace, channel_node_id)
video_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
video_data.update(ids_dict)
- video_data.update({ "kind": content_kinds.VIDEO })
+ video_data.update({"kind": content_kinds.VIDEO})
return video_data
+
@pytest.fixture
def video(video_file, video_data, channel):
args_data = get_content_node_args(video_data)
@@ -309,26 +348,29 @@ def video(video_file, video_data, channel):
video = VideoNode(*args_data, **contentnode_kwargs)
video.add_file(video_file)
channel.add_child(video)
- video_data['files'].append(video_file) # save it so we can compare later
+ video_data["files"].append(video_file) # save it so we can compare later
return video
+
@pytest.fixture
def video_invalid_files(video_data, document_file):
args_data = get_content_node_args(video_data)
contentnode_kwargs = get_content_node_kwargs(video_data)
- contentnode_kwargs['files'] = [] # clear files becuse added one above
+    contentnode_kwargs["files"] = []  # clear files because added one above
video = VideoNode(*args_data, **contentnode_kwargs)
video.add_file(document_file)
return video
+
@pytest.fixture
def invalid_video_file():
local_path = os.path.join("tests", "testcontent", "generated", "invalid_video.mp4")
if not os.path.exists(local_path):
- with open(local_path, 'wb') as f:
- f.write(b'this is an invalid video file')
+ with open(local_path, "wb") as f:
+ f.write(b"this is an invalid video file")
return DocumentFile(local_path)
+
@pytest.fixture
def youtube_video_dict():
"""
@@ -336,6 +378,7 @@ def youtube_video_dict():
"""
return {"youtube_id": "C0DPdy98e4c"}
+
@pytest.fixture
def youtube_video_with_subs_dict():
"""
@@ -343,34 +386,53 @@ def youtube_video_with_subs_dict():
"""
return {
"youtube_id": "USq6DX7byoY",
- "subtitles_langs": ["nl", "en", "en-GB", "fr", "el", "hu", "it", "pt", "ro", "es"]
+ "subtitles_langs": [
+ "nl",
+ "en",
+ "en-GB",
+ "fr",
+ "el",
+ "hu",
+ "it",
+ "pt",
+ "ro",
+ "es",
+ ],
}
+
# AUDIO FIXTURES
################################################################################
+
@pytest.fixture
def audio_file():
- source_url = "https://ia800103.us.archive.org/9/items/cd_prince_prince/" \
- "disc1/02.%20Prince%20-%201999%20%28Edit%29_sample.mp3"
+ source_url = (
+ "https://ia800103.us.archive.org/9/items/cd_prince_prince/"
+ "disc1/02.%20Prince%20-%201999%20%28Edit%29_sample.mp3"
+ )
local_path = os.path.join("tests", "testcontent", "downloaded", "testaudio.mp3")
download_fixture_file(source_url, local_path)
assert os.path.exists(local_path)
return AudioFile(local_path)
+
@pytest.fixture
def audio_filename():
- return 'c335e8044ecf583c690d5d8c65d68627.mp3'
+ return "c335e8044ecf583c690d5d8c65d68627.mp3"
@pytest.fixture
-def audio_data(contentnode_base_data, audio_file, channel_domain_namespace, channel_node_id):
+def audio_data(
+ contentnode_base_data, audio_file, channel_domain_namespace, channel_node_id
+):
audio_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
audio_data.update(ids_dict)
- audio_data.update({ "kind": content_kinds.AUDIO })
+ audio_data.update({"kind": content_kinds.AUDIO})
return audio_data
+
@pytest.fixture
def audio(audio_file, audio_data, channel):
args_data = get_content_node_args(audio_data)
@@ -378,29 +440,33 @@ def audio(audio_file, audio_data, channel):
audio = AudioNode(*args_data, **contentnode_kwargs)
audio.add_file(audio_file)
channel.add_child(audio)
- audio_data['files'].append(audio_file) # save it so we can compare later
+ audio_data["files"].append(audio_file) # save it so we can compare later
return audio
+
@pytest.fixture
def audio_invalid_files(audio_data, document_file):
args_data = get_content_node_args(audio_data)
contentnode_kwargs = get_content_node_kwargs(audio_data)
- contentnode_kwargs['files'] = [] # clear files because added one above
+ contentnode_kwargs["files"] = [] # clear files because added one above
audio = AudioNode(*args_data, **contentnode_kwargs)
audio.add_file(document_file)
return audio
+
@pytest.fixture
def invalid_audio_file():
local_path = os.path.join("tests", "testcontent", "generated", "invalid_audio.mp3")
if not os.path.exists(local_path):
- with open(local_path, 'wb') as f:
- f.write(b'invalid MP3')
+ with open(local_path, "wb") as f:
+ f.write(b"invalid MP3")
return DocumentFile(local_path)
+
# DOCUMENT FIXTURES
################################################################################
+
@pytest.fixture
def document_file():
source_url = "https://ia802506.us.archive.org/8/items/generalmanual_000075878/generalmanual_000075878.pdf"
@@ -409,18 +475,23 @@ def document_file():
assert os.path.exists(local_path)
return DocumentFile(local_path)
+
@pytest.fixture
def document_filename():
- return 'b976c31a7ab68a97f12541d661245238.pdf'
+ return "b976c31a7ab68a97f12541d661245238.pdf"
+
@pytest.fixture
-def document_data(contentnode_base_data, document_file, channel_domain_namespace, channel_node_id):
+def document_data(
+ contentnode_base_data, document_file, channel_domain_namespace, channel_node_id
+):
document_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
document_data.update(ids_dict)
- document_data.update({ "kind": content_kinds.DOCUMENT })
+ document_data.update({"kind": content_kinds.DOCUMENT})
return document_data
+
@pytest.fixture
def document(document_file, document_data, channel):
args_data = get_content_node_args(document_data)
@@ -428,14 +499,15 @@ def document(document_file, document_data, channel):
document = DocumentNode(*args_data, **contentnode_kwargs)
document.add_file(document_file)
channel.add_child(document)
- document_data['files'].append(document_file) # save it so we can compare later
+ document_data["files"].append(document_file) # save it so we can compare later
return document
+
@pytest.fixture
def document_invalid_files(document_data, audio_file):
args_data = get_content_node_args(document_data)
contentnode_kwargs = get_content_node_kwargs(document_data)
- contentnode_kwargs['files'] = [] # clear files becuse added one above
+    contentnode_kwargs["files"] = []  # clear files because added one above
document = DocumentNode(*args_data, **contentnode_kwargs)
document.add_file(audio_file)
return document
@@ -447,53 +519,66 @@ def epub_file():
assert os.path.exists(path)
return EPubFile(path)
+
@pytest.fixture
def epub_filename():
- return '5f91b55a7648206343b609cae692e08c.epub'
+ return "5f91b55a7648206343b609cae692e08c.epub"
@pytest.fixture
def invalid_document_file():
- local_path = os.path.join("tests", "testcontent", "generated", "invalid_document.pdf")
+ local_path = os.path.join(
+ "tests", "testcontent", "generated", "invalid_document.pdf"
+ )
if not os.path.exists(local_path):
- with open(local_path, 'wb') as f:
- f.write(b'invalid PDF')
+ with open(local_path, "wb") as f:
+ f.write(b"invalid PDF")
return DocumentFile(local_path)
+
@pytest.fixture
def invalid_epub_file():
- local_path = os.path.join("tests", "testcontent", "generated", "invalid_document.epub")
+ local_path = os.path.join(
+ "tests", "testcontent", "generated", "invalid_document.epub"
+ )
if not os.path.exists(local_path):
- with open(local_path, 'wb') as f:
- f.write(b'invalid ePub')
+ with open(local_path, "wb") as f:
+ f.write(b"invalid ePub")
return EPubFile(local_path)
# HTML FIXTURES
################################################################################
+
@pytest.fixture
def html_file():
- source_url = "https://studio.learningequality.org/content/storage/" \
- "e/d/ed494d6547b603b8ff22095cf5f5b624.zip"
+ source_url = (
+ "https://studio.learningequality.org/content/storage/"
+ "e/d/ed494d6547b603b8ff22095cf5f5b624.zip"
+ )
local_path = os.path.join("tests", "testcontent", "downloaded", "testhtml.zip")
download_fixture_file(source_url, local_path)
assert os.path.exists(local_path)
return HTMLZipFile(local_path)
+
@pytest.fixture
def html_filename():
- return 'ed494d6547b603b8ff22095cf5f5b624.zip'
+ return "ed494d6547b603b8ff22095cf5f5b624.zip"
@pytest.fixture
-def html_data(contentnode_base_data, html_file, channel_domain_namespace, channel_node_id):
+def html_data(
+ contentnode_base_data, html_file, channel_domain_namespace, channel_node_id
+):
html_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
html_data.update(ids_dict)
- html_data.update({ "kind": content_kinds.HTML5 })
+ html_data.update({"kind": content_kinds.HTML5})
return html_data
+
@pytest.fixture
def html(html_file, html_data, channel):
args_data = get_content_node_args(html_data)
@@ -501,7 +586,7 @@ def html(html_file, html_data, channel):
html = HTML5AppNode(*args_data, **contentnode_kwargs)
html.add_file(html_file)
channel.add_child(html)
- html_data['files'].append(html_file) # save it so we can compare later
+ html_data["files"].append(html_file) # save it so we can compare later
return html
@@ -512,7 +597,7 @@ def html_invalid_files(html_data, document_file):
"""
args_data = get_content_node_args(html_data)
contentnode_kwargs = get_content_node_kwargs(html_data)
- contentnode_kwargs['files'] = [] # clear files becuse added one above
+    contentnode_kwargs["files"] = []  # clear files because added one above
html = HTML5AppNode(*args_data, **contentnode_kwargs)
html.add_file(document_file)
return html
@@ -520,58 +605,75 @@ def html_invalid_files(html_data, document_file):
@pytest.fixture
def html_invalid_file():
- local_path = os.path.join("tests", "testcontent", "generated", "testinvalidhtml.zip")
+ local_path = os.path.join(
+ "tests", "testcontent", "generated", "testinvalidhtml.zip"
+ )
if not os.path.exists(local_path):
- with zipfile.ZipFile(local_path, 'w', zipfile.ZIP_DEFLATED) as archive:
- archive.writestr("notindex.html", '')
+ with zipfile.ZipFile(local_path, "w", zipfile.ZIP_DEFLATED) as archive:
+ archive.writestr("notindex.html", "")
return HTMLZipFile(local_path)
+
@pytest.fixture
def html_invalid_zip(html_data, html_invalid_file):
args_data = get_content_node_args(html_data)
contentnode_kwargs = get_content_node_kwargs(html_data)
- contentnode_kwargs['files'] = [] # clear files because added one above
+ contentnode_kwargs["files"] = [] # clear files because added one above
html = HTML5AppNode(*args_data, **contentnode_kwargs)
html.add_file(html_invalid_file)
return html
-
# EXERCISE FIXTURES
################################################################################
+
@pytest.fixture
def exercise_question():
return SingleSelectQuestion("question_1", "Question", "Answer", ["Answer"])
+
@pytest.fixture
def mastery_model():
- return {'mastery_model': exercises.M_OF_N, 'randomize': True, 'm': 1, 'n': 1}
+ return {"mastery_model": exercises.M_OF_N, "randomize": True, "m": 1, "n": 1}
+
@pytest.fixture
-def exercise_data(contentnode_base_data, mastery_model, exercise_question, channel_domain_namespace, channel_node_id):
+def exercise_data(
+ contentnode_base_data,
+ mastery_model,
+ exercise_question,
+ channel_domain_namespace,
+ channel_node_id,
+):
exercise_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
exercise_data.update(ids_dict)
- exercise_data.update({ "kind": content_kinds.EXERCISE,
- "questions":[],
- "exercise_data": mastery_model})
+ exercise_data.update(
+ {
+ "kind": content_kinds.EXERCISE,
+ "questions": [],
+ "exercise_data": mastery_model,
+ }
+ )
return exercise_data
+
@pytest.fixture
def exercise(exercise_question, exercise_data, channel):
args_data = get_content_node_args(exercise_data)
contentnode_kwargs = get_content_node_kwargs(exercise_data)
- del contentnode_kwargs['extra_fields']
- mastery_model_dict = contentnode_kwargs['exercise_data']
+ del contentnode_kwargs["extra_fields"]
+ mastery_model_dict = contentnode_kwargs["exercise_data"]
exercise = ExerciseNode(*args_data, **contentnode_kwargs)
exercise.add_question(exercise_question)
channel.add_child(exercise)
- exercise_data['questions'] = [exercise_question]
- exercise_data['extra_fields'] = mastery_model_dict
- del exercise_data['exercise_data']
+ exercise_data["questions"] = [exercise_question]
+ exercise_data["extra_fields"] = mastery_model_dict
+ del exercise_data["exercise_data"]
return exercise
+
@pytest.fixture
def exercise_invalid_question(exercise):
exercise = copy.deepcopy(exercise)
@@ -579,122 +681,131 @@ def exercise_invalid_question(exercise):
return exercise
-
# THUMBNAIL FILE FIXTURES
################################################################################
+
@pytest.fixture
def thumbnail_file():
local_path = os.path.join("tests", "testcontent", "samples", "thumbnail.png")
assert os.path.exists(local_path)
return ThumbnailFile(local_path)
+
@pytest.fixture
def thumbnail_filename():
- return 'eb79354ddd5774bb3436f9a19c282bff.png'
+ return "eb79354ddd5774bb3436f9a19c282bff.png"
+
@pytest.fixture
def fake_thumbnail_file():
local_path = os.path.join("tests", "testcontent", "generated", "invalidimage.png")
if not os.path.exists(local_path):
- with open(local_path, 'wb') as imgfile:
- imgfile.write(b'not_a_valid_PNG')
+ with open(local_path, "wb") as imgfile:
+ imgfile.write(b"not_a_valid_PNG")
return ThumbnailFile(local_path)
-
# EXERCISE IMAGES FIXTURES
################################################################################
+
@pytest.fixture
def exercise_image_file():
- return _ExerciseImageFile('tests/testcontent/exercises/no-wifi.png')
+ return _ExerciseImageFile("tests/testcontent/exercises/no-wifi.png")
+
@pytest.fixture
def exercise_image_filename():
- return '599aa896313be22dea6c0257772a464e.png'
+ return "599aa896313be22dea6c0257772a464e.png"
@pytest.fixture
def exercise_base64_image_file():
- with open('tests/testcontent/exercises/test_image_base64.data') as datafile:
+ with open("tests/testcontent/exercises/test_image_base64.data") as datafile:
base64_data = datafile.read()
return _ExerciseBase64ImageFile(base64_data)
+
@pytest.fixture
def exercise_base64_image_filename():
- return 'cd9635def904486701e7705ef29ece67.png'
+ return "cd9635def904486701e7705ef29ece67.png"
@pytest.fixture
def exercise_graphie_file():
- return _ExerciseGraphieFile('tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd')
+ return _ExerciseGraphieFile(
+ "tests/testcontent/exercises/eb3f3bf7c317408ee90995b5bcf4f3a59606aedd"
+ )
+
@pytest.fixture
def exercise_graphie_replacement_str():
- return 'eb3f3bf7c317408ee90995b5bcf4f3a59606aedd'
+ return "eb3f3bf7c317408ee90995b5bcf4f3a59606aedd"
+
@pytest.fixture
def exercise_graphie_filename():
- return 'ea2269bb5cf487f8d883144b9c06fbc7.graphie'
-
-
+ return "ea2269bb5cf487f8d883144b9c06fbc7.graphie"
# SLIDESHOW IMAGES FIXTURES
################################################################################
+
@pytest.fixture
def slideshow_files():
fake_files = []
- for i in range(0,10):
- filename = 'tests/testcontent/generated/slide' + str(i) + '.jpg'
+ for i in range(0, 10):
+ filename = "tests/testcontent/generated/slide" + str(i) + ".jpg"
if not os.path.exists(filename):
- with open(filename, 'w') as f:
- f.write('jpgdatawouldgohere' + str(i))
- fake_files.append(
- SlideImageFile(filename, caption='slide ' + str(i))
- )
+ with open(filename, "w") as f:
+ f.write("jpgdatawouldgohere" + str(i))
+ fake_files.append(SlideImageFile(filename, caption="slide " + str(i)))
return fake_files
+
@pytest.fixture
-def slideshow_data(contentnode_base_data, slideshow_files, channel_domain_namespace, channel_node_id):
+def slideshow_data(
+ contentnode_base_data, slideshow_files, channel_domain_namespace, channel_node_id
+):
slideshow_data = copy.deepcopy(contentnode_base_data)
ids_dict = genrate_random_ids(channel_domain_namespace, channel_node_id)
slideshow_data.update(ids_dict)
- slideshow_data.update({ "kind": content_kinds.SLIDESHOW })
+ slideshow_data.update({"kind": content_kinds.SLIDESHOW})
# TODO setup expected extra_fields['slideshow_data']
return slideshow_data
+
@pytest.fixture
def slideshow(slideshow_files, slideshow_data, channel):
args_data = get_content_node_args(slideshow_data)
contentnode_kwargs = get_content_node_kwargs(slideshow_data)
- del contentnode_kwargs['extra_fields']
+ del contentnode_kwargs["extra_fields"]
slideshow = SlideshowNode(*args_data, **contentnode_kwargs)
for slideshow_file in slideshow_files:
slideshow.add_file(slideshow_file)
channel.add_child(slideshow)
- slideshow_data['files'] = slideshow_files # save it so we can compare later
+ slideshow_data["files"] = slideshow_files # save it so we can compare later
return slideshow
-
# FIXTURE DOWNLOADING UTILS
################################################################################
+
def download_fixture_file(source_url, local_path):
"""
Download fixture file `source_url` to `local_path` if not present already.
"""
if os.path.exists(local_path):
return
- with open(local_path, 'wb') as f:
+ with open(local_path, "wb") as f:
response = requests.get(source_url, stream=True)
- assert response.status_code == 200, "Fixture file with url: {} not found".format(source_url)
+ assert (
+ response.status_code == 200
+ ), "Fixture file with url: {} not found".format(source_url)
for chunk in response.iter_content(chunk_size=1048576):
f.write(chunk)
f.flush()
f.close()
-
-
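For context on how the fixture-download helper in the conftest hunk above is typically wired into a test, here is a minimal, self-contained sketch (not part of the diff itself). The fixture name, URL, and filename are placeholders, and the helper is restated locally so the snippet runs on its own; it assumes network access and a reachable URL.

```python
import os

import pytest
import requests


def download_fixture_file(source_url, local_path):
    """Download `source_url` to `local_path` unless it is already present."""
    if os.path.exists(local_path):
        return
    with open(local_path, "wb") as f:
        response = requests.get(source_url, stream=True)
        assert response.status_code == 200, "Fixture file not found: " + source_url
        for chunk in response.iter_content(chunk_size=1048576):  # 1 MiB chunks
            f.write(chunk)


@pytest.fixture
def downloaded_sample_file(tmp_path):
    # tmp_path is pytest's built-in per-test temporary directory fixture
    local_path = str(tmp_path / "sample.bin")
    # Placeholder URL -- substitute a real fixture source in an actual test suite
    download_fixture_file("https://example.com/fixtures/sample.bin", local_path)
    return local_path
```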
diff --git a/tests/media_utils/README.md b/tests/media_utils/README.md
index 4c5d861d..df8c8280 100644
--- a/tests/media_utils/README.md
+++ b/tests/media_utils/README.md
@@ -26,15 +26,15 @@ Various media processing functions and utilities - vendored from the previously
## Converting caption files
This contains utilities for converting caption files from a few various
formats into the preferred `VTT` format. The currently supported formats include:
-- [DFXP](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language)
+- [DFXP](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language)
- [SAMI](https://en.wikipedia.org/wiki/SAMI)
- [SCC](http://www.theneitherworld.com/mcpoodle/SCC_TOOLS/DOCS/SCC_FORMAT.HTML)
-- [SRT](https://en.wikipedia.org/wiki/SubRip)
+- [SRT](https://en.wikipedia.org/wiki/SubRip)
- [TTML](https://en.wikipedia.org/wiki/Timed_Text_Markup_Language)
- [WebVTT or just VTT](https://en.wikipedia.org/wiki/WebVTT)
-> Within `ricecooker`, the term "captions" and "subtitles" are used interchangeably. All of the
-classes and functions handling conversion use the "subtitles" term.
+> Within `ricecooker`, the terms "captions" and "subtitles" are used interchangeably. All of the
+classes and functions handling conversion use the "subtitles" term.
### Language codes
@@ -98,7 +98,7 @@ converter = build_subtitle_converter_from_file('/path/to/file')
# Replace unknown language code if present
if converter.has_language(LANGUAGE_CODE_UNKNOWN):
converter.replace_unknown_language('en')
-
+
assert converter.has_language('en'), 'Must have English after replace'
output_str = converter.convert('en')
@@ -119,6 +119,3 @@ for lang_code in converter.get_language_codes():
elif lang_code == LANGUAGE_CODE_UNKNOWN:
raise InvalidSubtitleLanguageError('Unexpected unknown language')
```
-
-
-
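To tie the two README hunks above together, here is a hedged end-to-end sketch of the caption-conversion flow. The function and method names are taken directly from the README snippets; the import path and input file path are assumptions — adjust them to wherever the vendored subtitle utilities live in your checkout.

```python
# Import path is an assumption; the README's own examples omit it.
from ricecooker.utils.subtitles import (
    LANGUAGE_CODE_UNKNOWN,
    InvalidSubtitleLanguageError,
    build_subtitle_converter_from_file,
)

# Build a converter from any supported input format (DFXP, SAMI, SCC, SRT, TTML, VTT).
converter = build_subtitle_converter_from_file("/path/to/file.srt")  # placeholder path

# Some formats carry no language metadata; map the unknown track to a concrete code first.
if converter.has_language(LANGUAGE_CODE_UNKNOWN):
    converter.replace_unknown_language("en")

# Convert each labelled language to VTT, failing loudly if anything is still unlabelled.
for lang_code in converter.get_language_codes():
    if lang_code == LANGUAGE_CODE_UNKNOWN:
        raise InvalidSubtitleLanguageError("Unexpected unknown language")
    vtt_str = converter.convert(lang_code)
```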
diff --git a/tests/media_utils/files/assets/images/copyright.txt b/tests/media_utils/files/assets/images/copyright.txt
index fb3f2a64..3ac9a66b 100644
--- a/tests/media_utils/files/assets/images/copyright.txt
+++ b/tests/media_utils/files/assets/images/copyright.txt
@@ -1,4 +1,3 @@
File: 4933759886_098e9acf93_m.jpg
Source: https://flic.kr/p/8vYNVC
License: CC BY 2.0
-
diff --git a/tests/media_utils/files/page_with_links.html b/tests/media_utils/files/page_with_links.html
index 4cd05206..cb728a66 100644
--- a/tests/media_utils/files/page_with_links.html
+++ b/tests/media_utils/files/page_with_links.html
@@ -26,4 +26,4 @@ jQuery for ubernerds, chapter 1.