From 1d3f58730e6f49726ffd0a2747f54857e84c9fd5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 8 Apr 2024 12:46:31 +0200 Subject: [PATCH 001/204] First commit :) --- webtool/templates/explorer/header.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html index f700a10a4..957090eac 100644 --- a/webtool/templates/explorer/header.html +++ b/webtool/templates/explorer/header.html @@ -4,7 +4,7 @@

- 4CAT Explorer (beta){% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %} + 4CAT Explorer {% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %}

{{ key }} From c4a46069393f71c4eac8e19c1b6f32610eab3388 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 8 Apr 2024 17:09:14 +0200 Subject: [PATCH 002/204] Use regular `iterate_items` method when looping through dataset + minor changes --- webtool/__init__.py | 2 +- webtool/static/js/explorer.js | 4 +- webtool/templates/explorer/header.html | 2 +- webtool/templates/explorer/nav-pages.html | 2 +- .../{api_explorer.py => views_explorer.py} | 77 ++++++++++--------- 5 files changed, 46 insertions(+), 41 deletions(-) rename webtool/views/{api_explorer.py => views_explorer.py} (91%) diff --git a/webtool/__init__.py b/webtool/__init__.py index 8a1e38a5b..766fc6509 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -106,8 +106,8 @@ import webtool.views.views_dataset import webtool.views.views_misc +import webtool.views.views_explorer -import webtool.views.api_explorer import webtool.views.api_standalone import webtool.views.api_tool diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c421cc001..1ee2acc9a 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -596,14 +596,14 @@ const annotations = { data: json_annotations, success: function (response) { - // If the query is rejected by the server. + // If the query is accepted by the server. if (response == 'success') { $("#annotations-editor-container").hide(); $("#save-annotation-fields").addClass("invalid") $("#save-annotation-fields").prop("disabled", true); } - // If the query is accepted by the server. + // If the query is rejected by the server. else { annotations.warnEditor("Couldn't save annotation fields"); } diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html index 957090eac..ab3472fa7 100644 --- a/webtool/templates/explorer/header.html +++ b/webtool/templates/explorer/header.html @@ -15,7 +15,7 @@

Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

{% set post_count = max_posts %} {% endif %} -

Showing posts {{ offset + 1 }} - {{ post_count if (offset + limit) > post_count else (offset + limit) }} ({{ post_count }} in total).

+

Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

{% if custom_fields and custom_fields[0] == "invalid" %}

Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

{% endif %} diff --git a/webtool/templates/explorer/nav-pages.html b/webtool/templates/explorer/nav-pages.html index 23fe84f8d..b212f7d65 100644 --- a/webtool/templates/explorer/nav-pages.html +++ b/webtool/templates/explorer/nav-pages.html @@ -2,7 +2,7 @@ {% if post_count > max_posts %} {% set post_count = max_posts %} {% endif %} - {% set pages = ((post_count / limit) + (post_count % limit > 0))|int %} + {% set pages = ((post_count / posts_per_page) + (post_count % posts_per_page > 0))|int %} {% set selected = "selected" %} {% set lower_bound = 3 %} {% set upper_bound = pages - 2 %} diff --git a/webtool/views/api_explorer.py b/webtool/views/views_explorer.py similarity index 91% rename from webtool/views/api_explorer.py rename to webtool/views/views_explorer.py index 5131c1ca8..d07ab5273 100644 --- a/webtool/views/api_explorer.py +++ b/webtool/views/views_explorer.py @@ -1,5 +1,6 @@ """ -4CAT Data API - endpoints to get post and thread data from +4CAT Explorer views - pages that display datasets akin to +the 'native' appearance of the platform they were retrieved from. """ import datetime @@ -35,7 +36,7 @@ @openapi.endpoint("explorer") def explorer_dataset(key, page): """ - Show posts from a specific dataset + Show posts from a dataset :param str dataset_key: Dataset key @@ -54,20 +55,20 @@ def explorer_dataset(key, page): return error(403, error="This dataset is private.") if len(dataset.get_genealogy()) > 1: - return error(404, error="Exporer only available for top-level datasets") + return error(404, error="Explorer is only available for top-level datasets") results_path = dataset.check_dataset_finished() if not results_path: - return error(404, error="This dataset didn't finish executing (yet)") + return error(404, error="This dataset didn't finish executing") # The amount of posts to show on a page - limit = config.get("explorer.posts_per_page", 50) + posts_per_page = config.get("explorer.posts_per_page", 50) # The amount of posts that may be included (limit for large datasets) max_posts = config.get('explorer.max_posts', 500000) # The offset for posts depending on the current page - offset = ((page - 1) * limit) if page else 0 + offset = ((page - 1) * posts_per_page) if page else 0 # Load some variables parameters = dataset.get_parameters() @@ -83,7 +84,7 @@ if datasource in list(all_modules.datasources.keys()): is_local = True if all_modules.datasources[datasource].get("is_local") else False - # Check if we have to sort the data in a specific way. + # Check if we have to sort the data. sort_by = request.args.get("sort") if sort_by == "dataset-order": sort_by = None @@ -107,27 +108,25 @@ posts = [] count = 0 - first_post = False - - for post in iterate_items(results_path, max_rows=max_posts, sort_by=sort_by, descending=descending, force_int=force_int): + try: + for row in dataset.iterate_items(warn_unmappable=False): - count += 1 + count += 1 - # Use an offset if we're showing a page beyond the first. - if count <= offset: - continue + # Use an offset if we're showing a page beyond the first. + if count <= offset: + continue - # Attribute column names and collect dataset's posts. - post_ids.append(post["id"]) - posts.append(post) + # Attribute column names and collect dataset's posts. + post_ids.append(row["id"]) + posts.append(row) - if "link_id" in post: - if post["link_id"][2] == "_": - post["link_id"] = post["link_id"][3:] + # Stop if we exceed the allowed posts per page or the maximum number of posts.
+ if count >= (offset + posts_per_page) or count > max_posts: + break - # Stop if we exceed the max posts per page. - if count >= (offset + limit) or count > max_posts: - break + except NotImplementedError: + return error(404) # Include custom css if it exists in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.css'. @@ -160,16 +159,17 @@ annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, limit=limit, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) @app.route('/explorer/thread/<datasource>/<board>/<thread_id>') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_thread(datasource, board, thread_id): +def explorer_local_thread(datasource, board, thread_id): """ - Show a thread in the explorer + Show a thread. This is only available for local data sources, + and will be deprecated/changed in future updates. :param str datasource: Data source ID :param str board: Board name @@ -191,7 +191,7 @@ max_posts = config.get('explorer.max_posts', 500000) # Get the posts with this thread ID. - posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) if not posts: return error(404, error="No posts available for this thread") @@ -207,16 +207,18 @@ # The file's naming format should e.g. be 'reddit-explorer.json'. custom_fields = get_custom_fields(datasource) - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) @app.route('/explorer/post/<datasource>/<board>/<thread_id>') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_post(datasource, board, thread_id): +def explorer_local_posts(datasource, board, thread_id): """ - Show a thread in the explorer + Show posts from a local data source. + This is only available for local data sources, + and will be deprecated/changed in future updates. :param str datasource: Data source ID :param str board: Board name @@ -235,7 +237,7 @@ return error(404, error="No thread ID provided") # Get the posts with this thread ID.
- posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) posts = [strip_html(post) for post in posts] posts = [format(post) for post in posts] @@ -248,7 +250,7 @@ def explorer_post(datasource, board, thread_id): # The file's naming format should e.g. be 'reddit-explorer.json'. custom_fields = get_custom_fields(datasource) - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts)) + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, posts_per_page=len(posts), post_count=len(posts)) @app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @@ -482,7 +484,7 @@ def get_boards(datasource): Get available boards in datasource :param datasource: The datasource for which to acquire the list of available - boards. + boards. :return: A list containing a list of `boards`, as string IDs. :return-schema: {type=object,properties={ @@ -503,7 +505,7 @@ def get_boards(datasource): @app.route('/api/imagefile/') @login_required @setting_required("privileges.can_use_explorer") -def get_image_file(img_file, limit=0): +def get_image_file(img_file): """ Returns an image based on filename Request should hex the md5 hashes first (e.g. with hexdigest()) @@ -518,7 +520,7 @@ def get_image_file(img_file, limit=0): return send_file(str(image_path)) -def iterate_items(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): +def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): """ Loop through both csv and NDJSON files. :param in_file, str: The input file to read. @@ -582,7 +584,10 @@ def iterate_items(in_file, max_rows=None, sort_by=None, descending=False, force_ return Exception("Can't loop through file with extension %s" % suffix) -def get_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): +def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): + """ + Retrieve posts from a local data source based on post IDs. + """ if not ids: return None From cac644e79b998590d3af8831a9fb32e671730173 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 15:10:14 +0200 Subject: [PATCH 003/204] Change wording in Explorer settings --- common/lib/config_definition.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d1c5d6ea9..b1746bb78 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -102,8 +102,8 @@ "privileges.can_use_explorer": { "type": UserInput.OPTION_TOGGLE, "default": True, - "help": "Can use explorer", - "tooltip": "Controls whether users can use the Explorer feature to navigate datasets." + "help": "Can use Explorer", + "tooltip": "Controls whether users can use the Explorer feature to analyse and annotate datasets." 
}, "privileges.can_export_datasets": { "type": UserInput.OPTION_TOGGLE, @@ -305,13 +305,12 @@ "global": True }, # Explorer settings - # The maximum allowed amount of rows (prevents timeouts and memory errors) "explorer.max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", "coerce_type": int, - "tooltip": "Amount of posts to show in Explorer. The maximum allowed amount of rows (prevents timeouts and " + "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, "explorer.posts_per_page": { @@ -319,7 +318,7 @@ "default": 50, "help": "Posts per page", "coerce_type": int, - "tooltip": "Posts to display per page" + "tooltip": "Number of posts to display per page" }, # Web tool settings # These are used by the FlaskConfig class in config.py @@ -515,7 +514,7 @@ "4cat": "4CAT Tool settings", "api": "API credentials", "flask": "Flask settings", - "explorer": "Data Explorer", + "explorer": "Explorer", "datasources": "Data sources", "expire": "Dataset expiration settings", "mail": "Mail settings & credentials", From 8b78452fdeb70398ada668e966fb047f8dd83de5 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 15:11:03 +0200 Subject: [PATCH 004/204] Allow Explorer CSS to be inserted and changed in Settings --- datasources/bitchute/search_bitchute.py | 11 ++++ datasources/douban/search_douban.py | 11 ++++ datasources/douyin/search_douyin.py | 12 +++++ datasources/eightchan/search_8chan.py | 10 +++- datasources/eightkun/search_8kun.py | 10 +++- datasources/fourchan/search_4chan.py | 8 +++ datasources/imgur/search_imgur.py | 3 +- datasources/instagram/search_instagram.py | 12 +++++ datasources/linkedin/search_linkedin.py | 12 +++++ datasources/ninegag/search_9gag.py | 1 + datasources/parler/search_parler.py | 12 +++++ datasources/reddit/search_reddit.py | 10 +++- datasources/telegram/search_telegram.py | 8 +++ datasources/tiktok/search_tiktok.py | 12 +++++ datasources/tiktok_urls/search_tiktok_urls.py | 10 +++- .../tumblr/explorer/tumblr-explorer.css | 6 --- datasources/tumblr/search_tumblr.py | 12 ++++- datasources/twitter-import/search_twitter.py | 12 +++++ datasources/twitterv2/search_twitter.py | 10 +++- datasources/vk/search_vk.py | 11 ++++ webtool/views/views_explorer.py | 50 ++++++------------- 21 files changed, 193 insertions(+), 50 deletions(-) diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index c15540a50..37849891c 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -89,6 +89,17 @@ class SearchBitChute(Search): } + config = { + "explorer.bitchute-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Bitchute CSS", + "default": "", + "tooltip": "Custom CSS for Bitchute posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 0fb983fbe..841bb6037 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -75,6 +75,17 @@ class SearchDouban(Search): } } + config = { + "explorer.douban-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "TCAT CSS", + "default": "", + "tooltip": "Custom CSS for Douban posts in the the Explorer. 
This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Get Douban posts diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index bcad19bfb..b53aab2a4 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -8,6 +8,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchDouyin(Search): """ @@ -27,6 +28,17 @@ class SearchDouyin(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.douyin-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Douyin CSS", + "default": "", + "tooltip": "Custom CSS for Douyin posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index b3d6702b8..fb8970808 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -108,5 +108,13 @@ class Search8Chan(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - } + }, + "explorer.eightchan-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "8chan CSS", + "default": "", + "tooltip": "Custom CSS for 8chan posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index e54e69d3f..647434f6f 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -111,5 +111,13 @@ class Search8Kun(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - } + }, + "explorer.eightkun-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "8kun CSS", + "default": "", + "tooltip": "Custom CSS for 8kun posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 17694badc..7e8638caf 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -443,6 +443,14 @@ class Search4Chan(SearchWithScope): "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" }, + "explorer.fourchan-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "4chan CSS", + "default": "", + "tooltip": "Custom CSS for 4chan posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } } def get_items_simple(self, query): diff --git a/datasources/imgur/search_imgur.py b/datasources/imgur/search_imgur.py index d3e55c38d..72b04369a 100644 --- a/datasources/imgur/search_imgur.py +++ b/datasources/imgur/search_imgur.py @@ -8,8 +8,9 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput -class SearchNineGag(Search): +class SearchImgur(Search): """ Import scraped Imgur data """ diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index fff5a01c8..0118aef36 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -10,6 +10,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem, MissingMappedField from common.lib.exceptions import WorkerInterruptedException, MapItemException +from common.lib.helpers import UserInput class SearchInstagram(Search): @@ -30,6 +31,17 @@ class SearchInstagram(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)" ] + config = { + "explorer.instagram-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Instagram CSS", + "default": "", + "tooltip": "Custom CSS for Instagram posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + # some magic numbers instagram uses MEDIA_TYPE_PHOTO = 1 MEDIA_TYPE_VIDEO = 2 diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index d8c0df453..99c2e8efb 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -11,6 +11,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchLinkedIn(Search): """ @@ -30,6 +31,17 @@ class SearchLinkedIn(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" ] + config = { + "explorer.linkedin-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "LinkedIn CSS", + "default": "", + "tooltip": "Custom CSS for LinkedIn posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/ninegag/search_9gag.py b/datasources/ninegag/search_9gag.py index 973de82ba..4d3768361 100644 --- a/datasources/ninegag/search_9gag.py +++ b/datasources/ninegag/search_9gag.py @@ -8,6 +8,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchNineGag(Search): diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 8ccc7ccd8..3ceb95b3a 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -10,6 +10,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchParler(Search): @@ -26,6 +27,17 @@ class SearchParler(Search): # not available as a processor for existing datasets accepts = [None] + config = { + "explorer.parler-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Parler CSS", + "default": "", + "tooltip": "Custom CSS for Parler posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/reddit/search_reddit.py b/datasources/reddit/search_reddit.py index ead44b142..be21608bb 100644 --- a/datasources/reddit/search_reddit.py +++ b/datasources/reddit/search_reddit.py @@ -115,7 +115,15 @@ class SearchReddit(Search): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query Pushshift without specifying a keyword. This can lead to HUGE datasets!" - } + }, + "explorer.reddit-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Reddit CSS", + "default": "", + "tooltip": "Custom CSS for Reddit posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } # These change depending on the API type used, diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e0e9bb142..2b30aa07f 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -66,6 +66,14 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." + }, + "explorer.telegram-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Telegram CSS", + "default": "", + "tooltip": "Custom CSS for Telegram posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
} } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 90f443b49..29e082769 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -9,6 +9,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchTikTok(Search): @@ -29,6 +30,17 @@ class SearchTikTok(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.tiktok-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Tiktok CSS", + "default": "", + "tooltip": "Custom CSS for Tiktok posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index d8864be91..8a61d0f92 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -46,7 +46,15 @@ class SearchTikTokByID(Search): "default": 1.0, "help": "Request wait", "tooltip": "Time to wait before sending a new request from the same IP" - } + }, + "explorer.tiktok-urls-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Tiktok URLs CSS", + "default": "", + "tooltip": "Custom CSS for Tiktok URLs posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } options = { diff --git a/datasources/tumblr/explorer/tumblr-explorer.css b/datasources/tumblr/explorer/tumblr-explorer.css index a7b3df88d..1895e9961 100644 --- a/datasources/tumblr/explorer/tumblr-explorer.css +++ b/datasources/tumblr/explorer/tumblr-explorer.css @@ -1,9 +1,3 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. - - */ - body { background-color: #001935; } diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0ce4328dc..07f6a394d 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -73,6 +73,14 @@ class SearchTumblr(Search): 'help': 'Tumblr API Secret Key', 'tooltip': "", }, + "explorer.tumblr-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "CSS Tumblr", + "default": "", + "tooltip": "Custom CSS for Tumblr posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] @@ -484,9 +492,9 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): Get Tumblr posts posts with a certain blog :param tag, str: the name of the blog you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. - :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param max_date: a unix timestamp, indicates posts should be max_date this date. 
- :returns: a dict created from the JSON response + :returns: a dict created from the JSON response """ blog = blog + ".tumblr.com" diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index 8e8d39e30..b08854e40 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -9,6 +9,7 @@ from backend.lib.search import Search from common.lib.helpers import strip_tags from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchTwitterViaZeeschuimer(Search): @@ -29,6 +30,17 @@ class SearchTwitterViaZeeschuimer(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.twitter-import-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Twitter import CSS", + "default": "", + "tooltip": "Custom CSS for Twitter import posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index a3dbb4482..fe3069d0b 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -61,7 +61,15 @@ class SearchWithTwitterAPIv2(Search): "tooltip": "If enabled, allow users to enter a list of tweet IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." - } + }, + "explorer.twitter-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Twitter CSS", + "default": "", + "tooltip": "Custom CSS for Twitter posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } def get_items(self, query): diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index d04daba0a..9efc09a85 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -31,6 +31,17 @@ class SearchVK(Search): "[Python API wrapper](https://github.com/python273/vk_api)" ] + config = { + "explorer.vk-import-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "VK import CSS", + "default": "", + "tooltip": "Custom CSS for VK import posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } + } + expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index d07ab5273..62c90df4e 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -79,7 +79,7 @@ def explorer_dataset(key, page): # If the dataset is local, we can add some more features # (like the ability to navigate to threads) - is_local = False + is_local = False # CHANGE LATER ///////////////////// if datasource in list(all_modules.datasources.keys()): is_local = True if all_modules.datasources[datasource].get("is_local") else False @@ -128,9 +128,13 @@ def explorer_dataset(key, page): except NotImplementedError: return error(404) - # Include custom css if it exists in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.css'. + # Retrieve custom CSS if it is present in the datasource's config. + # If not given, we use a standard template. This standard CSS template + # can also be changed in the 4CAT control panel under the 'Explorer' + # settings. css = get_custom_css(datasource) + print(datasource) + print("CSS", css) # Include custom fields if it they are in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.json'. @@ -609,46 +613,20 @@ def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offse def get_custom_css(datasource): """ - Check if there's a custom css file for this dataset. - If so, return the text. - Custom css files should be placed in an 'explorer' directory in the the datasource folder and named - '-explorer.css' (e.g. 'reddit/explorer/reddit-explorer.css'). - See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for more information. + Check if there's custom CSS for this data source. + These can be inserted and edited on the Explorer settings page. + If these are absent, we revert to a standard template. :param datasource, str: Datasource name :return: The css as string. """ - # Set the directory name of this datasource. - # Some naming inconsistensies are caught here - if datasource == "twitter": - datasource_dir = "twitter-import" - datasource = "twitter-import" - else: - datasource_dir = datasource - - - css_path = Path(config.get('PATH_ROOT'), "datasources", datasource_dir, "explorer", datasource.lower() + "-explorer.css") - - print(css_path) - read = False - if css_path.exists(): - read = True - else: - # Allow both hypens and underscores in datasource name (to avoid some legacy issues) - css_path = re.sub(datasource, datasource.replace("-", "_"), str(css_path.absolute())) - if Path(css_path).exists(): - read = True - - # Read the css file if it exists - if read: - with open(css_path, "r", encoding="utf-8") as css: - css = css.read() - else: - css = None + custom_css = config.get("explorer." + datasource + "-explorer-css", "") + if not custom_css: + custom_css = config.get("explorer." 
+ datasource + "-search-explorer-css", "") - return css + return custom_css def get_custom_fields(datasource, filetype=None): """ From 0fe3ea64075d59fde8a536bc7194a638a59ba026 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:33:51 +0200 Subject: [PATCH 005/204] Move around Explorer CSS files --- .../static/css/{explorer.css => explorer/explorer-default.css} | 0 webtool/static/css/explorer/telegram-search.css | 3 +++ 2 files changed, 3 insertions(+) rename webtool/static/css/{explorer.css => explorer/explorer-default.css} (100%) create mode 100644 webtool/static/css/explorer/telegram-search.css diff --git a/webtool/static/css/explorer.css b/webtool/static/css/explorer/explorer-default.css similarity index 100% rename from webtool/static/css/explorer.css rename to webtool/static/css/explorer/explorer-default.css diff --git a/webtool/static/css/explorer/telegram-search.css b/webtool/static/css/explorer/telegram-search.css new file mode 100644 index 000000000..8b6e10cd1 --- /dev/null +++ b/webtool/static/css/explorer/telegram-search.css @@ -0,0 +1,3 @@ +* { + color: gold; +} \ No newline at end of file From e06760aa6438a7191759cf791cad83eca122741a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:34:18 +0200 Subject: [PATCH 006/204] Edit custom Explorer CSS options --- common/lib/config_definition.py | 17 +++++++++++++++-- datasources/bitchute/search_bitchute.py | 4 +--- datasources/douban/search_douban.py | 6 ++---- datasources/douyin/search_douyin.py | 4 +--- datasources/eightchan/search_8chan.py | 6 ++---- datasources/eightkun/search_8kun.py | 6 ++---- datasources/fourchan/search_4chan.py | 6 ++---- datasources/instagram/search_instagram.py | 4 +--- datasources/reddit/search_reddit.py | 10 +--------- datasources/telegram/search_telegram.py | 12 ++++++++---- webtool/views/views_explorer.py | 2 -- 11 files changed, 35 insertions(+), 42 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index b1746bb78..42e42b083 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.max_posts": { + "explorer.__max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,13 +313,26 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.posts_per_page": { + "explorer.__posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer._explanation_custom_fields": { + "type": UserInput.OPTION_INFO, + "help": "You can customise how posts per data source appear in the Explorer. " + "This involves *custom fields* via a JSON that points to what fields should " + "be displayed. These fields can also be formatted in a specific ways, for " + "instance as a URL or together with specific icons. If this JSON is absent, " + "the Explorer by default shows the `author`, `subject`, `timestamp`, `body`, and" + " `image` fields. *Custom CSS* can be added to change the appearance of posts. " + "This allows to mimic the original platform appearance. Custom CSS can be inserted " + "below. For some data sources, pre-made templates are available. These be toggled " + "below. If no custom or pre-made CSS is available, a general template is used." 
+ "tsts" + }, # Web tool settings # These are used by the FlaskConfig class in config.py # Flask may require a restart to update them diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index 37849891c..b42d317b3 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -94,9 +94,7 @@ class SearchBitChute(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Bitchute CSS", "default": "", - "tooltip": "Custom CSS for Bitchute posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Bitchute posts in the the Explorer." } } diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 841bb6037..2a7d1a23f 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -78,11 +78,9 @@ class SearchDouban(Search): config = { "explorer.douban-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "TCAT CSS", + "help": "Douban CSS", "default": "", - "tooltip": "Custom CSS for Douban posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Douban posts in the the Explorer." } } diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index b53aab2a4..9d926123d 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -33,9 +33,7 @@ class SearchDouyin(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douyin CSS", "default": "", - "tooltip": "Custom CSS for Douyin posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Douyin posts in the the Explorer." } } diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index fb8970808..57b92987a 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -109,12 +109,10 @@ class Search8Chan(Search4Chan): "default": [], "global": True }, - "explorer.eightchan-explorer-css": { + "explorer.eightchan-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "8chan CSS", "default": "", - "tooltip": "Custom CSS for 8chan posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 8chan posts in the the Explorer." } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index 647434f6f..333daa55e 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -112,12 +112,10 @@ class Search8Kun(Search4Chan): "default": [], "global": True }, - "explorer.eightkun-explorer-css": { + "explorer.eightkun-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "8kun CSS", "default": "", - "tooltip": "Custom CSS for 8kun posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 8kun posts in the the Explorer." 
} } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 7e8638caf..d0bfb8d84 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -443,13 +443,11 @@ class Search4Chan(SearchWithScope): "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" }, - "explorer.fourchan-explorer-css": { + "explorer.fourchan-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "4chan CSS", "default": "", - "tooltip": "Custom CSS for 4chan posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 4chan posts in the the Explorer." } } diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 0118aef36..32a7a75d6 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -36,9 +36,7 @@ class SearchInstagram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Instagram CSS", "default": "", - "tooltip": "Custom CSS for Instagram posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Instagram posts in the the Explorer." } } diff --git a/datasources/reddit/search_reddit.py b/datasources/reddit/search_reddit.py index be21608bb..ead44b142 100644 --- a/datasources/reddit/search_reddit.py +++ b/datasources/reddit/search_reddit.py @@ -115,15 +115,7 @@ class SearchReddit(Search): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query Pushshift without specifying a keyword. This can lead to HUGE datasets!" - }, - "explorer.reddit-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Reddit CSS", - "default": "", - "tooltip": "Custom CSS for Reddit posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } + } } # These change depending on the API type used, diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 2b30aa07f..a32789f19 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -67,13 +67,17 @@ class SearchTelegram(Search): "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." }, + "explorer.telegram-search-explorer-default-css": { + "type": UserInput.OPTION_TOGGLE, + "help": "Use Telegram default CSS", + "default": "", + "tooltip": "Add custom styling for Telegram posts in the the Explorer." + }, "explorer.telegram-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Telegram CSS", + "help": "Custom Telegram CSS", "default": "", - "tooltip": "Custom CSS for Telegram posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Telegram posts in the the Explorer." 
} } diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 62c90df4e..b02dcf1c2 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -133,8 +133,6 @@ def explorer_dataset(key, page): # can also be changed in the 4CAT control panel under the 'Explorer' # settings. css = get_custom_css(datasource) - print(datasource) - print("CSS", css) # Include custom fields if it they are in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.json'. From a921967749fd14c3aaf7ddaf8df80a4a539da675 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:35:03 +0200 Subject: [PATCH 007/204] Forgot to save these --- common/lib/config_definition.py | 1 - datasources/telegram/search_telegram.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 42e42b083..d113f8c09 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -331,7 +331,6 @@ "This allows to mimic the original platform appearance. Custom CSS can be inserted " "below. For some data sources, pre-made templates are available. These be toggled " "below. If no custom or pre-made CSS is available, a general template is used." - "tsts" }, # Web tool settings # These are used by the FlaskConfig class in config.py diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index a32789f19..60db3d76e 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -69,9 +69,9 @@ class SearchTelegram(Search): }, "explorer.telegram-search-explorer-default-css": { "type": UserInput.OPTION_TOGGLE, - "help": "Use Telegram default CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the the Explorer." + "help": "Use default Telegram CSS", + "default": True + "tooltip": "See " }, "explorer.telegram-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, From e37ebc97a1f29132b291dcb073f73744493307b8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Apr 2024 11:49:47 +0200 Subject: [PATCH 008/204] Typozzz --- common/lib/config_definition.py | 16 ++++++++-------- datasources/bitchute/search_bitchute.py | 2 +- datasources/douban/search_douban.py | 2 +- datasources/douyin/search_douyin.py | 2 +- datasources/eightchan/search_8chan.py | 2 +- datasources/eightkun/search_8kun.py | 2 +- datasources/fourchan/search_4chan.py | 2 +- datasources/instagram/search_instagram.py | 2 +- datasources/linkedin/search_linkedin.py | 2 +- datasources/parler/search_parler.py | 2 +- datasources/telegram/search_telegram.py | 2 +- datasources/tiktok/search_tiktok.py | 2 +- datasources/tiktok_urls/search_tiktok_urls.py | 2 +- datasources/tumblr/search_tumblr.py | 2 +- datasources/twitter-import/search_twitter.py | 2 +- datasources/twitterv2/search_twitter.py | 2 +- datasources/vk/search_vk.py | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d113f8c09..1e5d5949d 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -323,14 +323,14 @@ "explorer._explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. " - "This involves *custom fields* via a JSON that points to what fields should " - "be displayed. 
These fields can also be formatted in a specific ways, for " - "instance as a URL or together with specific icons. If this JSON is absent, " - "the Explorer by default shows the `author`, `subject`, `timestamp`, `body`, and" - " `image` fields. *Custom CSS* can be added to change the appearance of posts. " - "This allows to mimic the original platform appearance. Custom CSS can be inserted " - "below. For some data sources, pre-made templates are available. These be toggled " - "below. If no custom or pre-made CSS is available, a general template is used." + "This involves *custom fields*; a JSON that points to what fields should " + "be displayed. These fields can be formatted, for instance as a URL or together " + " with specific icons. If this JSON is absent, the Explorer by default shows the " + "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " + "added to change the appearance of posts. This allows to mimic the original platform " + "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " + "are available. These can be toggled below. If no custom or pre-made CSS is available, a " + "general template is used." }, # Web tool settings # These are used by the FlaskConfig class in config.py diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index b42d317b3..28a899237 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -94,7 +94,7 @@ class SearchBitChute(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Bitchute CSS", "default": "", - "tooltip": "Add custom styling for Bitchute posts in the the Explorer." + "tooltip": "Add custom styling for Bitchute posts in the Explorer." } } diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 2a7d1a23f..704fd8a23 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -80,7 +80,7 @@ class SearchDouban(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douban CSS", "default": "", - "tooltip": "Add custom styling for Douban posts in the the Explorer." + "tooltip": "Add custom styling for Douban posts in the Explorer." } } diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 9d926123d..ebf9b4450 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -33,7 +33,7 @@ class SearchDouyin(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douyin CSS", "default": "", - "tooltip": "Add custom styling for Douyin posts in the the Explorer." + "tooltip": "Add custom styling for Douyin posts in the Explorer." } } diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index 57b92987a..fdc3fc555 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -113,6 +113,6 @@ class Search8Chan(Search4Chan): "type": UserInput.OPTION_TEXT_LARGE, "help": "8chan CSS", "default": "", - "tooltip": "Add custom styling for 8chan posts in the the Explorer." + "tooltip": "Add custom styling for 8chan posts in the Explorer." 
} } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index 333daa55e..e32c4d4e2 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -116,6 +116,6 @@ class Search8Kun(Search4Chan): "type": UserInput.OPTION_TEXT_LARGE, "help": "8kun CSS", "default": "", - "tooltip": "Add custom styling for 8kun posts in the the Explorer." + "tooltip": "Add custom styling for 8kun posts in the Explorer." } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index d0bfb8d84..8a54812be 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -447,7 +447,7 @@ class Search4Chan(SearchWithScope): "type": UserInput.OPTION_TEXT_LARGE, "help": "4chan CSS", "default": "", - "tooltip": "Add custom styling for 4chan posts in the the Explorer." + "tooltip": "Add custom styling for 4chan posts in the Explorer." } } diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 32a7a75d6..fa22cedaf 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -36,7 +36,7 @@ class SearchInstagram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Instagram CSS", "default": "", - "tooltip": "Add custom styling for Instagram posts in the the Explorer." + "tooltip": "Add custom styling for Instagram posts in the Explorer." } } diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 99c2e8efb..65df1d55b 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -36,7 +36,7 @@ class SearchLinkedIn(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "LinkedIn CSS", "default": "", - "tooltip": "Custom CSS for LinkedIn posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for LinkedIn posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 3ceb95b3a..fab89e8ae 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -32,7 +32,7 @@ class SearchParler(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Parler CSS", "default": "", - "tooltip": "Custom CSS for Parler posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Parler posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 60db3d76e..e8496c1ab 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -77,7 +77,7 @@ class SearchTelegram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Custom Telegram CSS", "default": "", - "tooltip": "Add custom styling for Telegram posts in the the Explorer." + "tooltip": "Add custom styling for Telegram posts in the Explorer." 
} } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 29e082769..6aff822cc 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -35,7 +35,7 @@ class SearchTikTok(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Tiktok CSS", "default": "", - "tooltip": "Custom CSS for Tiktok posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tiktok posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index 8a61d0f92..82e8b1f1b 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -51,7 +51,7 @@ class SearchTikTokByID(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Tiktok URLs CSS", "default": "", - "tooltip": "Custom CSS for Tiktok URLs posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tiktok URLs posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 07f6a394d..0dc72c04a 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -77,7 +77,7 @@ class SearchTumblr(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "CSS Tumblr", "default": "", - "tooltip": "Custom CSS for Tumblr posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tumblr posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index b08854e40..b1d5a25d1 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -35,7 +35,7 @@ class SearchTwitterViaZeeschuimer(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Twitter import CSS", "default": "", - "tooltip": "Custom CSS for Twitter import posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Twitter import posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index fe3069d0b..76f7395bb 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -66,7 +66,7 @@ class SearchWithTwitterAPIv2(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Twitter CSS", "default": "", - "tooltip": "Custom CSS for Twitter posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Twitter posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index 9efc09a85..f4b42421e 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -36,7 +36,7 @@ class SearchVK(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "VK import CSS", "default": "", - "tooltip": "Custom CSS for VK import posts in the the Explorer. 
This allows to " + "tooltip": "Custom CSS for VK import posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } From a7668f0061461f69f98bc11a8ec7b3992df342b8 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 10 Apr 2024 17:28:40 +0200 Subject: [PATCH 009/204] First setup for dynamic Explorer options in Settings --- common/lib/config_definition.py | 72 +++++++++++++++++++++++-- datasources/telegram/search_telegram.py | 13 +---- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1e5d5949d..fec3f680d 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.__max_posts": { + "explorer._max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,14 +313,26 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer.posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, - "explorer._explanation_custom_fields": { + "explorer.explanation_custom_fields": { + "type": UserInput.OPTION_INFO, + "help": "You can customise how posts per data source appear in the Explorer. " + "This involves *custom fields*; a JSON that points to what fields should " + "be displayed. These fields can be formatted, for instance as a URL or together " + " with specific icons. If this JSON is absent, the Explorer by default shows the " + "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " + "added to change the appearance of posts. This allows to mimic the original platform " + "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " + "are available. These can be toggled below. If no custom or pre-made CSS is available, a " + "general template is used." + }, + "explorer.explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. " "This involves *custom fields*; a JSON that points to what fields should " @@ -520,6 +532,60 @@ }, } +# Dynamically add some Explorer options per data source. +# These are all the same, so we're looping over +# data sources to avoid redunancy. +modules = ["4chan", "telegram"] +for module in modules: + print(module) + # Explorer custom fields: default template, data source preset, or custom. + explorer_options = { + "explorer." + module + "-fields": { + "type": UserInput.OPTION_CHOICE, + "help": module + " fields", + "options": { + "general": "General fields", + "custom": "Custom (insert below)" + }, + "default": "general" + }, + # Custom Explorer fields JSON + "explorer." + module + "-fields-json": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Custom " + module + " fields", + "default": "", + "tooltip": "Add custom fields for " + module + " posts in the Explorer." + }, + # Explorer CSS: default template, data source preset, or custom. + "explorer." + module + "-css": { + "type": UserInput.OPTION_CHOICE, + "help": module + " CSS", + "options": { + "general": "General fields", + "custom": "Custom (insert below)" + }, + "default": "general" + }, + # Custom Explorer CSS + "explorer." 
+ module + "-css-text": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Custom " + module + " CSS", + "default": "", + "tooltip": "Add custom styling for " + module + " posts in the Explorer." + } + } + + # If this data source has preset custom fields and CSS stylesheets + # (which must be signalled via the `has_explorer_preset` attribute in the + # data source script), we're adding the default option to select this preset. + if module: + explorer_options["explorer." + module + "-fields"]["options"]["preset"] = "Data source preset" + explorer_options["explorer." + module + "-fields"]["default"] = "preset" + explorer_options["explorer." + module + "-css"]["options"]["preset"] = "Data source preset" + explorer_options["explorer." + module + "-css"]["default"] = "preset" + + config_definition = {**config_definition, **explorer_options} + # These are used in the web interface for more readable names # Can't think of a better place to put them... categories = { diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e8496c1ab..477cd9999 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -39,6 +39,7 @@ class SearchTelegram(Search): extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated + has_explorer_preset = True # Whether this data source has preset CSS and field settings for the Explorer # cache details_cache = None @@ -66,18 +67,6 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." - }, - "explorer.telegram-search-explorer-default-css": { - "type": UserInput.OPTION_TOGGLE, - "help": "Use default Telegram CSS", - "default": True - "tooltip": "See " - }, - "explorer.telegram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Custom Telegram CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the Explorer." 
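The config_definition.py loop above generates the same four Explorer options for every entry in `modules` and merges them into the central definition. Two details worth noting: `print(module)` is a leftover debug statement, and `if module:` is always truthy for a non-empty module name, so the "preset" choice is added unconditionally rather than only for data sources that set the `has_explorer_preset` attribute the comment mentions (the attribute this same commit adds to `SearchTelegram`). A condensed sketch of the intended pattern, with `datasource_classes` as a hypothetical lookup from module name to search class:

```python
from common.lib.user_input import UserInput  # import used by config_definition.py

config_definition = {}
modules = ["fourchan", "telegram"]  # illustrative module list
datasource_classes = {}             # hypothetical: module name -> Search class

for module in modules:
    explorer_options = {
        "explorer." + module + "-fields": {
            "type": UserInput.OPTION_CHOICE,
            "help": module + " fields",
            "options": {"general": "General fields", "custom": "Custom (insert below)"},
            "default": "general",
        },
    }
    # Only offer the "preset" choice when the data source actually signals
    # that it ships preset fields/CSS, as the comment in the patch describes.
    datasource = datasource_classes.get(module)
    if datasource and getattr(datasource, "has_explorer_preset", False):
        explorer_options["explorer." + module + "-fields"]["options"]["preset"] = "Data source preset"
        explorer_options["explorer." + module + "-fields"]["default"] = "preset"

    config_definition = {**config_definition, **explorer_options}
```

The follow-up commits replace these generated options with a single table-type setting, which keeps the settings page from growing by several options per data source.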
} } From 59e33b0c83736fdeabf32964ff658c6e6a69be8f Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 15 Apr 2024 11:40:37 +0200 Subject: [PATCH 010/204] First steps to datasource table user input --- common/lib/config_definition.py | 37 +++++++++++++- common/lib/user_input.py | 8 +++ datasources/bitchute/search_bitchute.py | 9 ---- datasources/douban/search_douban.py | 9 ---- datasources/douyin/search_douyin.py | 11 +--- datasources/eightchan/search_8chan.py | 8 +-- datasources/eightkun/search_8kun.py | 8 +-- datasources/fourchan/search_4chan.py | 8 +-- datasources/instagram/search_instagram.py | 9 ---- datasources/linkedin/search_linkedin.py | 11 ---- datasources/parler/search_parler.py | 11 ---- datasources/telegram/search_telegram.py | 12 ----- datasources/tiktok/search_tiktok.py | 11 ---- datasources/tiktok_urls/search_tiktok_urls.py | 10 +--- datasources/tumblr/search_tumblr.py | 8 --- datasources/twitter-import/search_twitter.py | 13 +---- datasources/twitterv2/search_twitter.py | 10 +--- datasources/vk/search_vk.py | 13 +---- webtool/lib/template_filters.py | 4 +- .../components/datasource-option.html | 50 +++++++++++++++++++ webtool/templates/controlpanel/config.html | 1 + 21 files changed, 104 insertions(+), 157 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1e5d5949d..d43ed9d99 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.__max_posts": { + "explorer._max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,13 +313,46 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer._posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer.explorer_config": { + "type": UserInput.OPTION_DATASOURCES_TABLE, + "help": "Explorer settings per data source", + "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "custom", "test": "TEST"}, "tumblr": {"enabled": True, "css": "preset", "fields": "custom"}}, + "columns": { + "enabled": { + "type": "toggle", + "help": "Enable Explorer" + }, + "name": { + "type": "text", + "help": "Test" + }, + "fields": { + "type": "choice", + "help": "Fields", + "options": { + "general": "Default fields", + "preset": "Data source preset", + "custom": "Custom (insert below)" + } + }, + "css": { + "type": "choice", + "help": "CSS", + "options": { + "general": "Default template", + "preset": "Data source preset", + "custom": "Custom (insert below)" + } + } + } + }, "explorer._explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. 
" diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 248421b5c..9d9996f11 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -35,6 +35,8 @@ class UserInput: OPTION_FILE = "file" # file upload OPTION_HUE = "hue" # colour hue OPTION_DATASOURCES = "datasources" # data source toggling + OPTION_DATASOURCES_TABLE = "datasources_table" # a table with settings per data source + OPTION_DATASOURCES_TEXT = "datasources_text" # text input per data source (via dropdown) OPTIONS_COSMETIC = (OPTION_INFO, OPTION_DIVIDER) @@ -142,6 +144,9 @@ def parse_all(options, input, silently_correct=True): parsed_input[option] = [datasource for datasource, v in datasources.items() if v["enabled"]] parsed_input[option.split(".")[0] + ".expiration"] = datasources + elif settings.get("type") == UserInput.OPTION_DATASOURCES_TABLE: + # special case, loop through a table to generate a JSON + print("yea") elif option not in input: # not provided? use default @@ -338,6 +343,9 @@ def parse_value(settings, choice, other_input=None, silently_correct=True): else: return choice + elif input_type == UserInput.DATASOURCES_TABLE: + return "weeird" + else: # no filtering return choice diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index 28a899237..c15540a50 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -89,15 +89,6 @@ class SearchBitChute(Search): } - config = { - "explorer.bitchute-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Bitchute CSS", - "default": "", - "tooltip": "Add custom styling for Bitchute posts in the Explorer." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 704fd8a23..0fb983fbe 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -75,15 +75,6 @@ class SearchDouban(Search): } } - config = { - "explorer.douban-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Douban CSS", - "default": "", - "tooltip": "Add custom styling for Douban posts in the Explorer." - } - } - def get_items(self, query): """ Get Douban posts diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index ebf9b4450..3997cd74c 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -27,16 +27,7 @@ class SearchDouyin(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - - config = { - "explorer.douyin-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Douyin CSS", - "default": "", - "tooltip": "Add custom styling for Douyin posts in the Explorer." 
- } - } - + def get_items(self, query): """ Run custom search diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index fdc3fc555..b3d6702b8 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -108,11 +108,5 @@ class Search8Chan(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - }, - "explorer.eightchan-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "8chan CSS", - "default": "", - "tooltip": "Add custom styling for 8chan posts in the Explorer." - } + } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index e32c4d4e2..e54e69d3f 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -111,11 +111,5 @@ class Search8Kun(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - }, - "explorer.eightkun-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "8kun CSS", - "default": "", - "tooltip": "Add custom styling for 8kun posts in the Explorer." - } + } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 8a54812be..7b69b872e 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -442,13 +442,7 @@ class Search4Chan(SearchWithScope): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" - }, - "explorer.fourchan-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "4chan CSS", - "default": "", - "tooltip": "Add custom styling for 4chan posts in the Explorer." - } + } } def get_items_simple(self, query): diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index fa22cedaf..daa42471d 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -31,15 +31,6 @@ class SearchInstagram(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)" ] - config = { - "explorer.instagram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Instagram CSS", - "default": "", - "tooltip": "Add custom styling for Instagram posts in the Explorer." - } - } - # some magic numbers instagram uses MEDIA_TYPE_PHOTO = 1 MEDIA_TYPE_VIDEO = 2 diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 65df1d55b..ef29353d4 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -31,17 +31,6 @@ class SearchLinkedIn(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" ] - config = { - "explorer.linkedin-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "LinkedIn CSS", - "default": "", - "tooltip": "Custom CSS for LinkedIn posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index fab89e8ae..07b6116ce 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -27,17 +27,6 @@ class SearchParler(Search): # not available as a processor for existing datasets accepts = [None] - config = { - "explorer.parler-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Parler CSS", - "default": "", - "tooltip": "Custom CSS for Parler posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e8496c1ab..e0e9bb142 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -66,18 +66,6 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." - }, - "explorer.telegram-search-explorer-default-css": { - "type": UserInput.OPTION_TOGGLE, - "help": "Use default Telegram CSS", - "default": True - "tooltip": "See " - }, - "explorer.telegram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Custom Telegram CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the Explorer." } } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 6aff822cc..b3214bc42 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -30,17 +30,6 @@ class SearchTikTok(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - config = { - "explorer.tiktok-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tiktok CSS", - "default": "", - "tooltip": "Custom CSS for Tiktok posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index 82e8b1f1b..d8864be91 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -46,15 +46,7 @@ class SearchTikTokByID(Search): "default": 1.0, "help": "Request wait", "tooltip": "Time to wait before sending a new request from the same IP" - }, - "explorer.tiktok-urls-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tiktok URLs CSS", - "default": "", - "tooltip": "Custom CSS for Tiktok URLs posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } + } } options = { diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0dc72c04a..191fec22e 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -72,14 +72,6 @@ class SearchTumblr(Search): 'default': "", 'help': 'Tumblr API Secret Key', 'tooltip': "", - }, - "explorer.tumblr-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "CSS Tumblr", - "default": "", - "tooltip": "Custom CSS for Tumblr posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." } } references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index b1d5a25d1..7984cc69b 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -29,18 +29,7 @@ class SearchTwitterViaZeeschuimer(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - - config = { - "explorer.twitter-import-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Twitter import CSS", - "default": "", - "tooltip": "Custom CSS for Twitter import posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - + def get_items(self, query): """ Run custom search diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 76f7395bb..a3dbb4482 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -61,15 +61,7 @@ class SearchWithTwitterAPIv2(Search): "tooltip": "If enabled, allow users to enter a list of tweet IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." - }, - "explorer.twitter-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Twitter CSS", - "default": "", - "tooltip": "Custom CSS for Twitter posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } + } } def get_items(self, query): diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index f4b42421e..22c5581a9 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -30,18 +30,7 @@ class SearchVK(Search): "[VK API documentation](https://vk.com/dev/first_guide)", "[Python API wrapper](https://github.com/python273/vk_api)" ] - - config = { - "explorer.vk-import-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "VK import CSS", - "default": "", - "tooltip": "Custom CSS for VK import posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } - } - + expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 9cbe1897e..dd04d3eb1 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -230,8 +230,8 @@ def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): -@app.template_filter('post_field') -def _jinja2_filter_post_field(field, post): +@app.template_filter('dict_field') +def _jinja2_filter_dict_field(field, post): # Extracts string values between {{ two curly brackets }} and uses that # as a dictionary key for the given dict. It then returns the corresponding value. # Mainly used in the Explorer. diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index 7c6c788de..32d414360 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -147,6 +147,56 @@ + {% elif settings.type == "datasources_table" %} + {{settings}} +
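A few hunks up, the `post_field` template filter becomes `dict_field`: per its comment it extracts strings between `{{ two curly brackets }}` and uses them as dictionary keys, which is what lets Explorer field definitions reference arbitrary post attributes. The diff shows only the rename, so this is a hedged sketch of how such a filter could work, not 4CAT's actual implementation:

```python
import re

# Hedged sketch of a {{ placeholder }}-resolving filter; the regex and the
# empty-string fallback for unknown keys are assumptions.
def dict_field(field, post):
    # Replace every "{{ key }}" occurrence with post["key"].
    return re.sub(
        r"\{\{\s*([^{}]+?)\s*\}\}",
        lambda match: str(post.get(match.group(1), "")),
        field,
    )

# e.g. dict_field("{{ author }} wrote: {{ body }}", {"author": "anon", "body": "hi"})
# -> "anon wrote: hi"
```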
+ {{option}} +
+ {{options.option}} +
+ + + + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + + + {% for datasource, datasource_config in datasources_config.items() %} + {% if datasource_config.enabled %} + + {% if datasource in settings.default %} + {% set default_settings = settings.default[datasource] %} + {% else %} + {% default_settings = None %} + {% endif %} + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
Data source{{ column.help }}
{{ datasource_config.name }} + {% if column.type == "text" %} + + } + {% elif column.type == "toggle" %} + + {% elif column.type == "choice" %} + + {% endif %} +
+
{% endif %} diff --git a/webtool/templates/controlpanel/config.html b/webtool/templates/controlpanel/config.html index ca5a7930d..cdc19d453 100644 --- a/webtool/templates/controlpanel/config.html +++ b/webtool/templates/controlpanel/config.html @@ -77,6 +77,7 @@

{% else %} {% set settings = options[option] %} {% endif %} + {% include 'components/datasource-option.html' %} {% endfor %} From 46628c62a6e9f720100000ddfc9c74f97dbad45e Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:19:43 +0200 Subject: [PATCH 011/204] Add basic UserInput.DATASOURCES_TABLE functionality, and use in Explorer settings page --- common/lib/config_definition.py | 75 +++++++++++-------- common/lib/user_input.py | 19 +++-- .../components/datasource-option.html | 38 +++++----- 3 files changed, 77 insertions(+), 55 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d43ed9d99..482b5969a 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,13 @@ "global": True }, # Explorer settings - "explorer._max_posts": { + "explorer.__basic-explanation": { + "type": UserInput.OPTION_INFO, + "help": "4CAT's Explorer feature lets you navigate and annotate datasets as if they " + "appared on their original platform. This is intended to facilitate qualitative " + "exploration and manual coding." + }, + "explorer.__max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,58 +319,65 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer._posts_per_page": { + "explorer.__posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer._config_explanation": { + "type": UserInput.OPTION_INFO, + "help": "Per data source, you can enable/disable the Explorer and customise how posts appear. " + "The latter involves *what fields to show* and *how posts are styled*. Many data sources have tailored " + "[presets for this](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/static/" + "explorer-presets). If presets are unavailable, a general template for [fields](https://github.com/" + "digitalmethodsinitiative/4cat/tree/master/webtool/static/explorer-presets/default-fields.json) " + "and [CSS styling](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/static/" + "explorer-presets/default-css.css) is used. You can also toggle between data source presets and the general " + "template via the table below." + }, + # "explorer._config_explanation2": { + # "type": UserInput.OPTION_INFO, + # "help": "Alternatively, you can also *customise fields and CSS yourself* by choosing the `Custom` setting in the " + # "table below and inserting JSON and CSS values in the text boxes underneath. See the [wiki for instructions " + # "on how to format custom fields and CSS](https://github.com/digitalmethodsinitiative/4cat/wiki/" + # "Exploring-and-annotating-datasets#add-custom-fields)." 
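The `explorer.explorer_config` table defined just below stores one small dict per data source (enabled / fields / css). Consumers read it back with chained `.get()` calls so that unknown data sources fail closed; `views_dataset.py` later in this series does exactly this to decide whether to show the Explorer button (by then the key has been shortened to `explorer.config`). A sketch of that read side, assuming `config` and `datasource` from the surrounding view code:

```python
# One nested dict per data source; chained .get() calls make missing
# entries fall back to "disabled" and the general templates.
explorer_config = config.get("explorer.config", {})
datasource_config = explorer_config.get(datasource, {})

enabled = datasource_config.get("enabled", False)
fields_mode = datasource_config.get("fields", "general")   # "general" or "preset"
css_mode = datasource_config.get("css", "general")
```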
+ # }, "explorer.explorer_config": { "type": UserInput.OPTION_DATASOURCES_TABLE, "help": "Explorer settings per data source", - "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "custom", "test": "TEST"}, "tumblr": {"enabled": True, "css": "preset", "fields": "custom"}}, + "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "general"}, "eightchan": {"enabled": True, "css": "general", "fields": "general"}, "eightkun": {"enabled": True, "css": "general", "fields": "general"}, "ninegag": {"enabled": True, "css": "general", "fields": "general"}, "bitchute": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcat": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcatv2": {"enabled": True, "css": "general", "fields": "general"}, "douban": {"enabled": True, "css": "general", "fields": "general"}, "douyin": {"enabled": False, "css": "general", "fields": "general"}, "imgur": {"enabled": True, "css": "general", "fields": "general"}, "upload": {"enabled": True, "css": "general", "fields": "general"}, "instagram": {"enabled": True, "css": "preset", "fields": "preset"}, "linkedin": {"enabled": True, "css": "general", "fields": "general"}, "parler": {"enabled": True, "css": "general", "fields": "general"}, "reddit": {"enabled": True, "css": "preset", "fields": "preset"}, "telegram": {"enabled": True, "css": "general", "fields": "general"}, "tiktok": {"enabled": True, "css": "preset", "fields": "preset"}, "tiktok-urls": {"enabled": True, "css": "preset", "fields": "preset"}, "tumblr": {"enabled": True, "css": "preset", "fields": "preset"}, "twitter": {"enabled": True, "css": "preset", "fields": "preset"}, "twitterv2": {"enabled": True, "css": "preset", "fields": "preset"}, "usenet": {"enabled": True, "css": "general", "fields": "general"}, "vk": {"enabled": True, "css": "general", "fields": "general"}}, "columns": { "enabled": { - "type": "toggle", - "help": "Enable Explorer" - }, - "name": { - "type": "text", - "help": "Test" + "type": UserInput.OPTION_TOGGLE, + "help": "Enable", + "tooltip": "Whether the Explorer is available for this data source", + "default": True }, "fields": { - "type": "choice", + "type": UserInput.OPTION_CHOICE, "help": "Fields", "options": { - "general": "Default fields", - "preset": "Data source preset", - "custom": "Custom (insert below)" - } + "general": "General", + "preset": "Preset" + }, + "default": "general", + "tooltip": "What fields to use (see explanation above)" }, "css": { - "type": "choice", + "type": UserInput.OPTION_CHOICE, "help": "CSS", "options": { - "general": "Default template", - "preset": "Data source preset", - "custom": "Custom (insert below)" - } + "general": "General", + "preset": "Preset" + }, + "default": "general", + "tooltip": "What CSS styling to use (see explanation above)" } } }, - "explorer._explanation_custom_fields": { - "type": UserInput.OPTION_INFO, - "help": "You can customise how posts per data source appear in the Explorer. " - "This involves *custom fields*; a JSON that points to what fields should " - "be displayed. These fields can be formatted, for instance as a URL or together " - " with specific icons. If this JSON is absent, the Explorer by default shows the " - "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " - "added to change the appearance of posts. This allows to mimic the original platform " - "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " - "are available. These can be toggled below. 
If no custom or pre-made CSS is available, a " - "general template is used." - }, + "explorer" # Web tool settings # These are used by the FlaskConfig class in config.py # Flask may require a restart to update them diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 9d9996f11..4de5478c2 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -144,9 +144,21 @@ def parse_all(options, input, silently_correct=True): parsed_input[option] = [datasource for datasource, v in datasources.items() if v["enabled"]] parsed_input[option.split(".")[0] + ".expiration"] = datasources + elif settings.get("type") == UserInput.OPTION_DATASOURCES_TABLE: - # special case, loop through a table to generate a JSON - print("yea") + # special case, parse table values to generate a dict + columns = list(settings["columns"].keys()) + table_input = {} + + for datasource in list(settings["default"].keys()): + table_input[datasource] = {} + for column in columns: + + choice = input.get(option + "-" + datasource + "-" + column, False) + column_settings = settings["columns"][column] # sub-settings per column + table_input[datasource][column] = UserInput.parse_value(column_settings, choice, table_input, silently_correct=True) + + parsed_input[option] = table_input elif option not in input: # not provided? use default @@ -343,9 +355,6 @@ def parse_value(settings, choice, other_input=None, silently_correct=True): else: return choice - elif input_type == UserInput.DATASOURCES_TABLE: - return "weeird" - else: # no filtering return choice diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index 32d414360..e52f917d5 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -148,44 +148,41 @@ {% elif settings.type == "datasources_table" %} - {{settings}} -
- {{option}} -
- {{options.option}} + {% set tooltips = {} %}
- {% for column_id, column in settings.columns.items() %} - + {% endfor %} {% for datasource, datasource_config in datasources_config.items() %} {% if datasource_config.enabled %} - - {% if datasource in settings.default %} - {% set default_settings = settings.default[datasource] %} - {% else %} - {% default_settings = None %} - {% endif %} {% for column_id, column in settings.columns.items() %}
Data source{{ column.help }}{{ column.help }} + {% if column.tooltip %} + + {% set x = tooltips.__setitem__(column_id, column.tooltip) %} + {% endif %} +
{{ datasource_config.name }} - {% if column.type == "text" %} - - } + {% set column_value = "" %} + {% if datasource in settings.default and settings.default[datasource][column_id] %} + {% set column_value = settings.default[datasource][column_id] %} + {% endif %} + {% if column.type == "string" %} + {% elif column.type == "toggle" %} - + {% elif column.type == "choice" %} - {% for value, label in column.options.items() %} - + {% endfor %} {% endif %} @@ -196,6 +193,9 @@ {% endfor %}
+ {% for tooltip, tooltip_text in tooltips.items() %} + + {% endfor %}
{% endif %} From 340d1ff50aee44d82ba1533936f8e7b0cb492738 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:41:51 +0200 Subject: [PATCH 012/204] Simplify config setting name --- common/lib/config_definition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 482b5969a..636bb3c82 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -344,7 +344,7 @@ # "on how to format custom fields and CSS](https://github.com/digitalmethodsinitiative/4cat/wiki/" # "Exploring-and-annotating-datasets#add-custom-fields)." # }, - "explorer.explorer_config": { + "explorer.config": { "type": UserInput.OPTION_DATASOURCES_TABLE, "help": "Explorer settings per data source", "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "general"}, "eightchan": {"enabled": True, "css": "general", "fields": "general"}, "eightkun": {"enabled": True, "css": "general", "fields": "general"}, "ninegag": {"enabled": True, "css": "general", "fields": "general"}, "bitchute": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcat": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcatv2": {"enabled": True, "css": "general", "fields": "general"}, "douban": {"enabled": True, "css": "general", "fields": "general"}, "douyin": {"enabled": False, "css": "general", "fields": "general"}, "imgur": {"enabled": True, "css": "general", "fields": "general"}, "upload": {"enabled": True, "css": "general", "fields": "general"}, "instagram": {"enabled": True, "css": "preset", "fields": "preset"}, "linkedin": {"enabled": True, "css": "general", "fields": "general"}, "parler": {"enabled": True, "css": "general", "fields": "general"}, "reddit": {"enabled": True, "css": "preset", "fields": "preset"}, "telegram": {"enabled": True, "css": "general", "fields": "general"}, "tiktok": {"enabled": True, "css": "preset", "fields": "preset"}, "tiktok-urls": {"enabled": True, "css": "preset", "fields": "preset"}, "tumblr": {"enabled": True, "css": "preset", "fields": "preset"}, "twitter": {"enabled": True, "css": "preset", "fields": "preset"}, "twitterv2": {"enabled": True, "css": "preset", "fields": "preset"}, "usenet": {"enabled": True, "css": "general", "fields": "general"}, "vk": {"enabled": True, "css": "general", "fields": "general"}}, From dfbe5f3478b08e95639ab6b477110eefb3ab6112 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:42:05 +0200 Subject: [PATCH 013/204] Only show Explorer when enabled per data source --- webtool/templates/components/result-result-row.html | 4 ++-- webtool/views/views_dataset.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 6a24e484d..7f71f0ef4 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -45,9 +45,9 @@ {% endif %}
  • - {% if __user_config("privileges.can_use_explorer") %} + {% if __user_config("privileges.can_use_explorer") and has_explorer %} - Explore + Explorer {% endif %} diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 411173a7a..725a965c0 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -423,7 +423,7 @@ def show_result(key): datasources = backend.all_modules.datasources datasource_expiration = config.get("datasources.expiration", {}).get(datasource, {}) expires_datasource = False - can_unexpire = ((config.get('expire.allow_optout') and \ + can_unexpire = ((config.get("expire.allow_optout") and \ datasource_expiration.get("allow_optout", True)) or datasource_expiration.get("allow_optout", False)) \ and (current_user.is_admin or dataset.is_accessible_by(current_user, "owner")) @@ -437,6 +437,8 @@ def show_result(key): elif dataset.parameters.get("expires-after"): timestamp_expires = dataset.parameters.get("expires-after") + has_explorer = config.get("explorer.config", {}).get(datasource, {}).get("enabled", False) + # if the dataset has parameters with credentials, give user the option to # erase them has_credentials = [p for p in dataset.parameters if p.startswith("api_") and p not in ("api_type", "api_track")] @@ -449,7 +451,8 @@ def show_result(key): return render_template(template, dataset=dataset, parent_key=dataset.key, processors=backend.all_modules.processors, is_processor_running=is_processor_running, messages=get_flashed_messages(), is_favourite=is_favourite, timestamp_expires=timestamp_expires, has_credentials=has_credentials, - expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, datasources=datasources) + expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, has_explorer=has_explorer, + datasources=datasources) @app.route('/results//processors/queue//', methods=["GET", "POST"]) From 28abb423e756693262998e614715232dbc5f28e9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 19:26:21 +0200 Subject: [PATCH 014/204] First steps in integrating the Explorer more with the main interface --- webtool/static/css/stylesheet.css | 5 ++ .../components/datasource-option.html | 2 +- webtool/templates/explorer/controls.html | 54 +++++++++++++ webtool/templates/explorer/explorer.html | 79 ++++++++----------- webtool/templates/explorer/footer.html | 1 - webtool/templates/explorer/header.html | 54 ------------- webtool/views/views_explorer.py | 33 ++++---- 7 files changed, 109 insertions(+), 119 deletions(-) create mode 100644 webtool/templates/explorer/controls.html delete mode 100644 webtool/templates/explorer/footer.html delete mode 100644 webtool/templates/explorer/header.html diff --git a/webtool/static/css/stylesheet.css b/webtool/static/css/stylesheet.css index acc4409be..0fd5d7733 100644 --- a/webtool/static/css/stylesheet.css +++ b/webtool/static/css/stylesheet.css @@ -1205,4 +1205,9 @@ ol.result-list li.has_results .property-container.analysis a { .result-list .child-list > li { padding: 0; margin: 0.5em 0 0 0; +} + +#explorer-posts, #explorer-posts > ol li { + all: initial; + padding: 0; } \ No newline at end of file diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index e52f917d5..51e26bf16 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -153,7 +153,7 @@ - + {% for column_id, column in settings.columns.items() %}
    Data sourceEnabled data sources{{ column.help }} {% if column.tooltip %} diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html new file mode 100644 index 000000000..f0bcd793d --- /dev/null +++ b/webtool/templates/explorer/controls.html @@ -0,0 +1,54 @@ + +
    +

    + {{ dataset.get_label() }} +

    + + {% if key %} +
    + {% if post_count > max_posts %} +

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    + {% set post_count = max_posts %} + {% endif %} +
    + + {% elif thread %} +

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    +

    Note that the archived posts may not be complete.

    + {% endif %} + +
    + +

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    +{% if custom_fields and custom_fields[0] == "invalid" %} +

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    +{% endif %} +{% if custom_fields and 'sort_options' in custom_fields %} +
    +

    Sort posts by: + +

    +
    +{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 9a7251990..6cde4748b 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -1,48 +1,37 @@ - - - 4CAT Explorer • {% if parameters and parameters.get("label") %}{{ parameters.get("label") }}{% elif key %}{{ key }}{% elif thread %}{{ thread }}{% endif %} - - - - - - - - - - - - - - - - - - - - - - {% if custom_css %} - - {% endif %} - - - -
    - {% include "explorer/header.html" %} - - {% if not thread %} - {% include "explorer/annotations.html" %} - {% endif %} -
    +{% extends "layout.html" %} + +{% block title %}Datasets & previous results{% endblock %} +{% block body_class %}result-list plain-page{% endblock %} +{% block breadcrumbs %}{% set navigation.current = "dataset" %}{% endblock %} + +{% block body %} + + + + + + + + + + +{% if custom_css %} + +{% endif %} + +{% include "explorer/controls.html" %} + +{% if not thread %} + {% include "explorer/annotations.html" %} +{% endif %}
    {% include "explorer/nav-pages.html" %} -
      +
        {% for post in posts %} {% include "explorer/post.html" %} {% endfor %} @@ -50,8 +39,4 @@ {% include "explorer/nav-pages.html" %}
    -
    - {% include "explorer/footer.html" %} -
    - - \ No newline at end of file +{% endblock %} \ No newline at end of file diff --git a/webtool/templates/explorer/footer.html b/webtool/templates/explorer/footer.html deleted file mode 100644 index 258a61e98..000000000 --- a/webtool/templates/explorer/footer.html +++ /dev/null @@ -1 +0,0 @@ -

    Rendered by 4CAT

    \ No newline at end of file diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html deleted file mode 100644 index ab3472fa7..000000000 --- a/webtool/templates/explorer/header.html +++ /dev/null @@ -1,54 +0,0 @@ -

    - - Return to dataset - - - - 4CAT Explorer {% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %} - -

    -{{ key }} -
    - {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    - {% if custom_fields and custom_fields[0] == "invalid" %} -

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    - {% endif %} - {% if custom_fields and 'sort_options' in custom_fields %} -
    -

    Sort posts by: - -

    -
    - {% endif %} -
    - - {% elif thread %} -

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    -

    Note that the archived posts may not be complete.

    - {% endif %} - -
    \ No newline at end of file diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index b02dcf1c2..437b20f2c 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -28,8 +28,8 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/explorer/dataset//', defaults={'page': 0}) -@app.route('/explorer/dataset//') +@app.route('/results//explorer/', defaults={'page': 0}) +@app.route('/results//explorer/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -50,7 +50,14 @@ def explorer_dataset(key, page): dataset = DataSet(key=key, db=db) except DataSetException: return error(404, error="Dataset not found.") + + # Load some variables + parameters = dataset.get_parameters() + datasource = parameters["datasource"] + post_count = int(dataset.data["num_rows"]) + annotation_fields = dataset.get_annotation_fields() + # See if we can actually serve this page if dataset.is_private and not (config.get("privileges.can_view_all_datasets") or dataset.is_accessible_by(current_user)): return error(403, error="This dataset is private.") @@ -61,6 +68,9 @@ def explorer_dataset(key, page): if not results_path: return error(404, error="This dataset didn't finish executing") + if datasource not in config.get("explorer.config") and not config["explorer.config"][datasource]["enabled"]: + return error(404, error="Explorer functionality disabled for %s" % datasource) + # The amount of posts to show on a page posts_per_page = config.get("explorer.posts_per_page", 50) @@ -70,19 +80,10 @@ def explorer_dataset(key, page): # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 - # Load some variables - parameters = dataset.get_parameters() - datasource = parameters["datasource"] - board = parameters.get("board", "") - post_count = int(dataset.data["num_rows"]) - annotation_fields = dataset.get_annotation_fields() - - # If the dataset is local, we can add some more features - # (like the ability to navigate to threads) - is_local = False # CHANGE LATER ///////////////////// - if datasource in list(all_modules.datasources.keys()): - is_local = True if all_modules.datasources[datasource].get("is_local") else False + # f the dataset is generated from an API-accessible database, we can add + # extra features like the ability to navigate across posts. + has_database = False # CHANGE LATER ///////////////////// # Check if we have to sort the data. sort_by = request.args.get("sort") @@ -121,7 +122,7 @@ def explorer_dataset(key, page): post_ids.append(row["id"]) posts.append(row) - # Stop if we exceed the allowed posts per page or max. posts. + # Stop if we exceed the allowed posts per page or max posts. 
if count >= (offset + posts_per_page) or count > max_posts: break @@ -161,7 +162,7 @@ def explorer_dataset(key, page): annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) @app.route('/explorer/thread///') @api_ratelimit From 70d00b1a5a056fc1286b50c502f437ef32432fa9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 16 Apr 2024 17:52:16 +0200 Subject: [PATCH 015/204] First steps in bringing back sorting --- common/lib/dataset.py | 34 +++- common/lib/helpers.py | 18 ++- .../static/css/explorer/explorer-default.css | 145 +----------------- webtool/static/js/explorer.js | 6 +- webtool/templates/explorer/controls.html | 73 ++++----- webtool/templates/explorer/explorer.html | 9 +- webtool/templates/explorer/nav-pages.html | 61 -------- webtool/templates/explorer/pagination.html | 63 ++++++++ webtool/views/views_explorer.py | 89 ++++------- 9 files changed, 183 insertions(+), 315 deletions(-) delete mode 100644 webtool/templates/explorer/nav-pages.html create mode 100644 webtool/templates/explorer/pagination.html diff --git a/common/lib/dataset.py b/common/lib/dataset.py index a48e6f053..dd7a96eec 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ import backend from common.config_manager import config from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, convert_to_float, flatten_dict from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -235,7 +235,7 @@ def log(self, log): with log_path.open("a", encoding="utf-8") as outfile: outfile.write("%s: %s\n" % (datetime.datetime.now().strftime("%c"), log)) - def _iterate_items(self, processor=None): + def _iterate_items(self, processor=None, sort=None): """ A generator that iterates through a CSV or NDJSON file @@ -268,6 +268,14 @@ def _iterate_items(self, processor=None): wrapped_infile = NullAwareTextIOWrapper(infile, encoding="utf-8") reader = csv.DictReader(wrapped_infile, **csv_parameters) + # In some cases, we want to sort the dataset first. + if sort: + # Generate reader on the basis of sort value + # At the moment, this is very inefficient, but + # suffices for the few cases where `sort` is used. 
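The CSV branch that continues below derives a positional index with `next(reader).index(sort)`, but `csv.DictReader` yields dicts, which have no `.index()`, and the `next()` call would also consume the first data row (the header is already handled by `DictReader` itself). A working sketch of the intended column sort, using the `convert_to_float` helper this commit adds to common/lib/helpers.py:

```python
import csv
from common.lib.helpers import convert_to_float

# Hedged sketch: sort a csv.DictReader on the requested column *name*
# rather than a positional index. Materialising the reader in sorted()
# keeps the same memory trade-off the patch itself accepts.
def iterate_csv_sorted(infile, sort, descending=True):
    reader = csv.DictReader(infile)
    for row in sorted(reader, key=lambda r: convert_to_float(r.get(sort)), reverse=descending):
        yield row
```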
+ sort_by_index = next(reader).index(sort) + reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) + for item in reader: if hasattr(processor, "interrupted") and processor.interrupted: raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") @@ -277,16 +285,28 @@ def _iterate_items(self, processor=None): elif path.suffix.lower() == ".ndjson": # In NDJSON format each line in the file is a self-contained JSON with path.open(encoding="utf-8") as infile: - for line in infile: - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + # Sorting can't be done easily here, + # we have to loop through the entire JSON first. + # Don't enable for large files! + if sort: + + for line in sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True): + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + yield line + + else: + for line in infile: + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + yield json.loads(line) else: raise NotImplementedError("Cannot iterate through %s file" % path.suffix) - def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default"): + def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default", sort=None): """ Generate mapped dataset items @@ -338,7 +358,7 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau item_mapper = True # Loop through items - for i, item in enumerate(self._iterate_items(processor)): + for i, item in enumerate(self._iterate_items(processor, sort=sort)): # Save original to yield original_item = item.copy() diff --git a/common/lib/helpers.py b/common/lib/helpers.py index d097e4b72..306c435a3 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -206,6 +206,22 @@ def convert_to_int(value, default=0): except (ValueError, TypeError): return default +def convert_to_float(value, default=0): + """ + Convert a value to a floating point, with a fallback + + The fallback is used if an Error is thrown during converstion to float. + This is a convenience function, but beats putting try-catches everywhere + we're using user input as a floating point number. 
+ + :param value: Value to convert + :param int default: Default value, if conversion not possible + :return int: Converted value + """ + try: + return float(value) + except (ValueError, TypeError): + return default def timify_long(number): """ @@ -789,7 +805,7 @@ def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'): Lists will be converted to json strings via json.dumps() :param MutableMapping d: Dictionary like object - :param str partent_key: The original parent key prepending future nested keys + :param str parent_key: The original parent key prepending future nested keys :param str sep: A seperator string used to combine parent and child keys :return dict: A new dictionary with the no nested values """ diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/explorer-default.css index a3a60c0b2..562f150d9 100644 --- a/webtool/static/css/explorer/explorer-default.css +++ b/webtool/static/css/explorer/explorer-default.css @@ -30,138 +30,6 @@ See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotati src: url("../fonts/OpenSans-BoldItalic.ttf") } -/** --------------------- * - Basic HTML elements - * --------------------- */ -*, body, code, select, input, textarea { - font-family: 'Open Sans', 'Trebuchet MS', sans-serif; -} - -body { - background-color: #f9fbff; - margin: 0; - padding: 0; -} - -select, input, textarea { - background: var(--gray); - border: 1px solid var(--gray); - font-size: 14px; - padding: 0.25em; -} - -label { - font-size: 14px; -} - -button { - border: 2px solid var(--contrast-dark); - background: var(--contrast-bright); - color: var(--text); - border-radius: 0.5em; - font-size: 14px; - cursor: pointer; - padding: 0.25em 1em; -} - -button:hover { - background: var(--accent); - color: var(--contrast-bright); -} - -button.invalid, button.invalid:hover { - cursor: default; - background: var(--contrast-bright); - color: var(--text); - border-color: var(--gray-dark); -} - -textarea { - width: 340px; -} - -/** --------------------- * - Header - * --------------------- */ - -body > header { - width: 100%; - margin: 0; - padding: 0; -} - -body > header h1 { - box-shadow: 0 5px 10px #888; - margin: 0; - padding: 0; - font-size: 1.5em; - background: var(--contrast-dark); - color: var(--contrast-bright); - text-align: center; - font-size: 1.5em; - line-height: 1.5em; - font-weight: bold; - padding: 0.50em 0; - cursor: default; -} - -body > header #metadata { - font-size: 16px; - min-width: 640px; - max-width: 960px; - margin: 0 auto; - margin-top: 40px; - margin-bottom: 40px; - text-align: center; -} - -body > header #metadata #parameters > span { - font-family: monospace; - font-size: 12px; - display: inline-block; - background: white; - margin: 2px; - padding: 4px; - border: 1px solid black; - border-radius: 5px; - cursor: default; -} - -body > header .return a { - position: absolute; - left: 0; - padding-left: 12px; - color: white; - font-size: 0.6em; - text-decoration: none; -} - -#dataset-key { - display: none; -} - -/** --------------------- * - Navigation pages - * --------------------- */ -.nav-pages { - text-align: center; -} - -span.page { - display: inline-block; - padding: 10px; - min-width: 20px; - overflow: hidden; - color: black; - background-color: white; - font-family: monospace; - border: 1px solid black; -} - -span.page.selected { - color: white; - background-color: black; -} /** --------------------- * Posts @@ -265,7 +133,7 @@ span.divider { /** --------------------- * Annotations 
editor * --------------------- */ -#annotations-editor-container { +/*#annotations-editor-container { background: rgba(0, 0, 0, .4); display: none; height: 100%; @@ -500,13 +368,4 @@ li.post.op > .post-annotations { } .posts .external-url { -} - -/** --------------------- * - Footer - * --------------------- */ -footer { - text-align: center; - margin-top: 40px; - margin-bottom: 70px; -} \ No newline at end of file +}*/ \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 1ee2acc9a..9f0e05a88 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -709,7 +709,7 @@ const annotations = { error: function (error) { annotations.enableSaving(); $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); + //alert("Could't save annotations"); console.log(error) } }); @@ -854,7 +854,9 @@ const page_functions = { force_int = "" } - window.location.href = getRelativeURL('explorer/dataset/' + $("#dataset-key").text() + "?sort=" + $(this).val() + sort_order + force_int); + let dataset_key = $("#dataset-key").text(); + alert(dataset_key) + window.location.href = getRelativeURL("result/" + dataset_key + "/explorer/?sort=" + $(this).val() + sort_order + force_int); }); // Change the dropdown sort option based on the URL parameter diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index f0bcd793d..aeaa5bebd 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -1,54 +1,43 @@ - -
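The explorer.js change above now sends the browser to `result/<key>/explorer/?sort=...` (note the `alert(dataset_key)` left in, presumably a temporary debug call), and the rewritten view at the end of this patch maps the 1-based page number to an offset with `offset = (page - 1) * posts_per_page` before skipping and breaking inside the item loop. `itertools.islice` expresses the same window in one step; a sketch assuming the `dataset` object from that view:

```python
from itertools import islice

# Page numbers are 1-based, so page 1 maps to offset 0. islice skips
# `offset` rows and yields at most `posts_per_page` rows, replacing the
# manual count/continue/break bookkeeping in the view's loop.
def get_page(dataset, page, posts_per_page, sort=None):
    offset = (page - 1) * posts_per_page
    rows = dataset.iterate_items(warn_unmappable=False, sort=sort)
    return list(islice(rows, offset, offset + posts_per_page))
```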
    +

    {{ dataset.get_label() }}

    + {% if custom_fields and custom_fields[0] == "invalid" %} +
    + Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). +
    + {% endif %} {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -
    - - {% elif thread %} -

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    -

    Note that the archived posts may not be complete.

    +
    + {% if post_count > max_posts %} +

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    + {% set post_count = max_posts %} + {% endif %} +
    {% endif %} -
    + {# some different info for views generated by a direct API call #} + {% if not key and has_database %} +

    Showing {{ post_count }} posts from {{ datasource }}.

    + {% else %} +

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    + {% endif %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    -{% if custom_fields and custom_fields[0] == "invalid" %} -

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    -{% endif %} -{% if custom_fields and 'sort_options' in custom_fields %} -
    -

    Sort posts by: - -

    +
    +

    Sort posts by: + +

    -{% endif %} \ No newline at end of file + +
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 6cde4748b..4ae46f924 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -23,20 +23,25 @@ {% endif %} +{{ key }} + {% include "explorer/controls.html" %} +{% include "explorer/pagination.html" %} + {% if not thread %} {% include "explorer/annotations.html" %} {% endif %} +
    - {% include "explorer/nav-pages.html" %}
      {% for post in posts %} {% include "explorer/post.html" %} {% endfor %}
    - {% include "explorer/nav-pages.html" %}
    +{% include "explorer/pagination.html" %} + {% endblock %} \ No newline at end of file diff --git a/webtool/templates/explorer/nav-pages.html b/webtool/templates/explorer/nav-pages.html deleted file mode 100644 index b212f7d65..000000000 --- a/webtool/templates/explorer/nav-pages.html +++ /dev/null @@ -1,61 +0,0 @@ - \ No newline at end of file diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html new file mode 100644 index 000000000..ebf522890 --- /dev/null +++ b/webtool/templates/explorer/pagination.html @@ -0,0 +1,63 @@ + \ No newline at end of file diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 437b20f2c..9264745c5 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -28,13 +28,13 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 0}) -@app.route('/results//explorer/') +@app.route('/result//explorer/', defaults={'page': 1}) +@app.route('/result//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(key, page): +def explorer_dataset(key, page=1): """ Show posts from a dataset @@ -80,15 +80,14 @@ def explorer_dataset(key, page): # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 - - # f the dataset is generated from an API-accessible database, we can add + # If the dataset is generated from an API-accessible database, we can add # extra features like the ability to navigate across posts. - has_database = False # CHANGE LATER ///////////////////// + has_database = False # INTEGRATE LATER ///////////////////// # Check if we have to sort the data. - sort_by = request.args.get("sort") - if sort_by == "dataset-order": - sort_by = None + sort = request.args.get("sort") + if sort == "dataset-order": + sort = None # Check if we have to reverse the order. descending = request.args.get("desc") @@ -109,8 +108,10 @@ def explorer_dataset(key, page): posts = [] count = 0 + sort = "id" + try: - for row in dataset.iterate_items(warn_unmappable=False): + for row in dataset.iterate_items(warn_unmappable=False, sort=sort): count += 1 @@ -164,15 +165,14 @@ def explorer_dataset(key, page): # Generate the HTML page return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) -@app.route('/explorer/thread///') +@app.route('/result///explorer') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_local_thread(datasource, board, thread_id): +def explorer_database_thread(datasource, board, thread_id): """ - Show a thread. This is only available for local data sources, - and will be depracated/changed in future updates. + Show a thread from an API-accessible database. 
 :param str datasource: Data source ID :param str board: Board name @@ -217,11 +217,9 @@ @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_local_posts(datasource, board, thread_id): +def explorer_database_posts(datasource, board, thread_id): """ - Show a posts from a local data source. - This is only available for local data sources, - and will be depracated/changed in future updates. + Show posts from an API-accessible database. :param str datasource: Data source ID :param str board: Board name @@ -535,34 +533,25 @@ def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=Fal suffix = in_file.name.split(".")[-1].lower() - if suffix == "csv": - - with open(in_file, "r", encoding="utf-8") as dataset_file: - - # Sort on date by default - # Unix timestamp integers are not always saved in the same field. - reader = csv.reader(dataset_file) - columns = next(reader) - if sort_by: - try: - # Get index number of sort_by value - sort_by_index = columns.index(sort_by) - - # Generate reader on the basis of sort_by value - reader = sorted(reader, key=lambda x: to_float(x[sort_by_index], convert=force_int) if len(x) >= sort_by_index else 0, reverse=descending) + # Sort on date by default + # Unix timestamp integers are not always saved in the same field. + reader = csv.reader(dataset_file) + columns = next(reader) + if sort_by: + try: + print("YES") + except (ValueError, IndexError) as e: + pass - except (ValueError, IndexError) as e: - pass + for item in reader: - for item in reader: + # Add columns + #item = {columns[i]: item[i] for i in range(len(item))} - # Add columns - item = {columns[i]: item[i] for i in range(len(item))} - - yield item - - elif suffix == "ndjson": + yield item + if suffix == "ndjson": + print("TRUEE") # In this format each line in the file is a self-contained JSON # file with open(in_file, "r", encoding="utf-8") as dataset_file: # Unfortunately we can't easily sort here. # We're just looping through the file if no sort is given. if not sort_by: for line in dataset_file: item = json.loads(line) yield item # If a sort order is given explicitly, we're sorting anyway. else: keys = sort_by.split(".") - - if max_rows: - for item in sorted([json.loads(line) for i, line in enumerate(dataset_file) if i < max_rows], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item - else: - for item in sorted([json.loads(line) for line in dataset_file], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item + yield item return Exception("Can't loop through file with extension %s" % suffix) @@ -689,14 +672,6 @@ def get_nested_value(di, keys): return 0 return di -def to_float(value, convert=False): - if convert: - if not value: - value = 0 - else: - value = float(value) - return value - def strip_html(post): post["body"] = strip_tags(post.get("body", "")) return post From e93736233546b446c2accd47d8307710762d4e7c Mon Sep 17 00:00:00 2001 From: Sal Hagen <sal-hagen@hotmail.com> Date: Wed, 17 Apr 2024 12:49:36 +0200 Subject: [PATCH 016/204] More sorting stuff --- common/lib/dataset.py | 33 ++---- .../static/css/explorer/explorer-default.css | 4 + webtool/templates/explorer/controls.html | 14 ++- webtool/templates/explorer/explorer.html | 2 +- webtool/views/views_explorer.py | 103 +++++++----------- 5 files changed, 63 insertions(+), 93 deletions(-) diff --git a/common/lib/dataset.py b/common/lib/dataset.py index dd7a96eec..5eb49c37c 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -235,7 +235,7 @@ def log(self, log): with log_path.open("a", encoding="utf-8") as outfile: outfile.write("%s: %s\n" % (datetime.datetime.now().strftime("%c"), log)) - def _iterate_items(self, processor=None, sort=None): + def _iterate_items(self, processor=None): """ A generator that iterates through a CSV or NDJSON file @@ -268,14 +268,6 @@ wrapped_infile = NullAwareTextIOWrapper(infile, encoding="utf-8") reader = csv.DictReader(wrapped_infile, **csv_parameters) - # In some cases, we want to sort the dataset first. - if sort: - # Generate reader on the basis of sort value - # At the moment, this is very inefficient, but - # suffices for the few cases where `sort` is used. - sort_by_index = next(reader).index(sort) - reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) - for item in reader: if hasattr(processor, "interrupted") and processor.interrupted: raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") yield item elif path.suffix.lower() == ".ndjson": - # In NDJSON format each line in the file is a self-contained JSON + with path.open(encoding="utf-8") as infile: - # Sorting can't be done easily here, - # we have to loop through the entire JSON first. - # Don't enable for large files! - if sort: - - for line in sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True): - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - yield line + for line in infile: + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - else: - for line in infile: - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - yield json.loads(line) else: raise NotImplementedError("Cannot iterate through %s file" % path.suffix) - def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default", sort=None): + def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default"): """ Generate mapped dataset items @@ -358,7 +339,7 @@ item_mapper = True # Loop through items - for i, item in enumerate(self._iterate_items(processor, sort=sort)): + for i, item in enumerate(self._iterate_items(processor)): # Save original to yield original_item = item.copy() diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/explorer-default.css index 562f150d9..f726d741c 100644 --- a/webtool/static/css/explorer/explorer-default.css +++ b/webtool/static/css/explorer/explorer-default.css @@ -133,6 +133,10 @@ span.divider { /** --------------------- * Annotations editor * --------------------- */ +#annotations-editor-container { + display: none; +} + /*#annotations-editor-container { background: rgba(0, 0, 0, .4); display: none; diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index aeaa5bebd..3ab99c3b0 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -31,11 +31,15 @@ </div 

    Sort posts by:

    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 4ae46f924..4ab1fd206 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -6,7 +6,7 @@ {% block body %} - + diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 9264745c5..576c1e484 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -90,28 +90,20 @@ def explorer_dataset(key, page=1): sort = None # Check if we have to reverse the order. - descending = request.args.get("desc") - if descending == "true" or descending == True: - descending = True - else: - descending = False + reverse = True if request.args.get("desc") in ("true", True) else False # Check if we have to convert the sort value to an integer. - force_int = request.args.get("int") - if force_int == "true" or force_int == True: - force_int = True - else: - force_int = False + force_number = True if request.args.get("int") in ("true", True) else False # Load posts post_ids = [] posts = [] count = 0 - sort = "id" - - try: - for row in dataset.iterate_items(warn_unmappable=False, sort=sort): + # If we're sorting, we need to iterate over the entire + # dataset first. Else we can simply use `iterate_items`. + if not sort: + for row in dataset.iterate_items(warn_unmappable=False): count += 1 @@ -126,9 +118,15 @@ def explorer_dataset(key, page=1): # Stop if we exceed the allowed posts per page or max posts. if count >= (offset + posts_per_page) or count > max_posts: break - - except NotImplementedError: - return error(404) + else: + for row in sort_and_iterate_items(dataset, sort, reverse=reverse, warn_unmappable=False): + count += 1 + if count <= offset: + continue + post_ids.append(row["id"]) + posts.append(row) + if count >= (offset + posts_per_page) or count > max_posts: + break # Retrieve custom CSS if it is present in the datasource's config. # If not given, we use a standard template. This standard CSS template @@ -238,7 +236,7 @@ def explorer_database_posts(datasource, board, thread_id): return error(404, error="No thread ID provided") # Get the posts with this thread ID. - posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_database_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) posts = [strip_html(post) for post in posts] posts = [format(post) for post in posts] @@ -521,58 +519,41 @@ def get_image_file(img_file): return send_file(str(image_path)) -def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): +def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False, **kwargs): """ Loop through both csv and NDJSON files. - :param in_file, str: The input file to read. - :param sort_by, str: The key that determines the sort order. - :param descending, bool: Whether to sort by descending values. - :param force_int, bool: Whether the sort value should be converted to an - integer. + This is basically a wrapper function for `iterate_items()` with the + added functionality of sorting a dataset. Because the Explorer is (currently) + the only feature that requires sorting, we define it here. + This first iterates through the entire file (with a max limit) to determine + an order. Then it yields items based on this order. + + :param key, str: The dataset object. + :param sort_by, str: The item key that determines the sort order. 
+ :param reverse, bool: Whether to sort by largest values first. + :param force_number, bool: Whether the sort value should be converted to a + floating point number. """ - suffix = in_file.name.split(".")[-1].lower() - - # Sort on data date by default - # Unix timestamp integers are not always saved in the same field. - reader = csv.reader(dataset_file) - columns = next(reader) - if sort_by: - try: - print("YES") - except (ValueError, IndexError) as e: - pass - - for item in reader: - - # Add columns - #item = {columns[i]: item[i] for i in range(len(item))} - - yield item - - if suffix == "ndjson": - print("TRUEE") - # In this format each line in the file is a self-contained JSON - # file - with open(in_file, "r", encoding="utf-8") as dataset_file: + # Storing posts in the right order here + posts = [] - # Unfortunately we can't easily sort here. - # We're just looping through the file if no sort is given. - if not sort_by: - for line in dataset_file: - item = json.loads(line) - yield item + # Generate reader on the basis of sort value + # At the moment, this is very inefficient, but + # suffices for the few cases where `sort` is used. + #sort_by_index = next(reader).index(sort) + #reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) + #sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True) - # If a sort order is given explicitly, we're sorting anyway. - else: - keys = sort_by.split(".") - yield item + for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort]): + posts.append(item) - return Exception("Can't loop through file with extension %s" % suffix) + for post in posts: + yield post -def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): +def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ - Retrieve posts from a local data source based on post IDs. + Retrieve posts by ID from a database-accessible data source. 
""" if not ids: From 11eaaf945fa6205077ec736ba688027c01f5a0f4 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Apr 2024 17:46:23 +0200 Subject: [PATCH 017/204] Fix and simplify sorting, control box styling --- webtool/static/css/dataset-page.css | 7 ++ webtool/static/js/explorer.js | 75 ++++++++---------- .../explorer/annotations-editor.html | 55 ++++++++++++++ webtool/templates/explorer/annotations.html | 69 ----------------- webtool/templates/explorer/controls.html | 76 +++++++++++-------- webtool/templates/explorer/explorer.html | 7 +- webtool/views/views_explorer.py | 51 ++++++------- 7 files changed, 162 insertions(+), 178 deletions(-) create mode 100644 webtool/templates/explorer/annotations-editor.html delete mode 100644 webtool/templates/explorer/annotations.html diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 1c7b908da..e82742bda 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -198,6 +198,12 @@ article.result > section:first-child { line-height: 1.3em; } +.button-like-small.disabled { + cursor: not-allowed; + opacity: 0.5; +} + + .dataset-owner-list li { display: inline-block; } @@ -225,6 +231,7 @@ article.result > section:first-child { background: var(--gray-light); border: 1px solid var(--gray-dark); font-size: 0.8em; + cursor: pointer; } .dataset-toolbox a:hover, a.button-like-small:hover { diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 9f0e05a88..8c45e9b7e 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -37,7 +37,15 @@ const annotations = { $("#add-annotation-field").on("click", function(){$("#annotation-fields").append(annotations.getAnnotationsDiv);}); // Show and hide the annotations editor - $("#toggle-annotation-fields").on("click", function(){$("#annotations-editor-container").toggle();}); + $("#toggle-annotation-fields").on("click", function(){ + $("#annotations-editor-container").toggle(); + if ($("#annotation-controls-buttons").hasClass("hidden")) { + $(this).html(""); + } + else { + $(this).html(""); + } + }); $("#annotations-editor").click(function(e) { e.stopPropagation(); }); @@ -599,8 +607,7 @@ const annotations = { // If the query is accepted by the server. if (response == 'success') { $("#annotations-editor-container").hide(); - $("#save-annotation-fields").addClass("invalid") - $("#save-annotation-fields").prop("disabled", true); + $("#save-annotation-fields").addClass("disabled"); } // If the query is rejected by the server. @@ -695,7 +702,7 @@ const annotations = { annotations.enableSaving(); $("#save-annotations").html(" Annotations saved"); - $("#save-annotations").addClass("invalid").prop("disabled", true); + $("#save-annotations").addClass("disabled"); old_annotation_fields = $("#annotation-fields").html(); // alert(alert_message); } @@ -746,24 +753,24 @@ const annotations = { // So we just need to check whether they're there. 
if (Object.keys(annotation_fields).length < 1) { - $("#toggle-annotations").addClass("invalid"); + $("#toggle-annotations").addClass("disabled"); return false; } else { - $("#toggle-annotations").removeClass("invalid"); + $("#toggle-annotations").removeClass("disabled"); return true; } }, enableSaving: function(){ // Enable saving annotations to the database - $("#save-annotations, #save-to-dataset").removeClass("invalid").removeAttr("disabled"); + $("#save-annotations, #save-to-dataset").removeClass("disabled"); $("#save-annotations").html(" Save annotations"); }, disableSaving: function(){ // Disable saving annotations to the database - $("#save-annotations, #save-to-dataset").addClass("invalid").prop("disabled", true); + $("#save-annotations, #save-to-dataset").addClass("disabled"); }, warnEditor: function(warning) { @@ -778,13 +785,13 @@ const annotations = { toggleAnnotations: function() { let ta = $("#toggle-annotations"); - if (ta.hasClass("hidden")) { - ta.removeClass("hidden"); + if (ta.hasClass("shown")) { + ta.removeClass("shown"); ta.html(" Hide annotations"); $(".post-annotations").show(200); } else { - ta.addClass("hidden"); + ta.addClass("shown"); ta.html(" Show annotations"); $(".post-annotations").hide(200); } @@ -832,51 +839,29 @@ const page_functions = { })); // Reorder the dataset when the sort type is changed - $("#sort-select").on("change", function(){ + $(".sort-select").on("change", function(){ - let selected = $(this).find("option:selected"); - - // Pass whether the order should be reversed or not - let sort_order = selected.data("desc"); - if (sort_order){ - sort_order = "&desc=true" - } - else { - sort_order = "" - } + // Get the column to sort on, an whether we should sort in reverse. + let selected = $("#column-sort-select").find("option:selected").val(); + let order = $("#column-sort-order").find("option:selected").val(); - // Pass whether we should treat this value as an integer - let force_int = selected.data("force-int"); - if (force_int){ - force_int = "&int=true" - } - else { - force_int = "" + sort_order = "" + if (order == "reverse"){ + sort_order = "&order=reverse" } let dataset_key = $("#dataset-key").text(); - alert(dataset_key) - window.location.href = getRelativeURL("result/" + dataset_key + "/explorer/?sort=" + $(this).val() + sort_order + force_int); + window.location.href = getRelativeURL("results/" + dataset_key + "/explorer/?sort=" + selected + sort_order); }); // Change the dropdown sort option based on the URL parameter let searchParams = new URLSearchParams(window.location.search) - let sort_order = searchParams.get("sort"); - let desc = searchParams.get("desc"); - + let selected = searchParams.get("sort"); + let sort_order = searchParams.get("order"); + $("#column-sort-select").find("option[value='" + selected + "']").attr("selected", "selected"); if (sort_order) { - // There can be multiple options with the same key since - // one of them might be reversed and the other not (e.g. - // timestamps sorted by new to old and vice versa). - // So select the sort order with the right desc attribute. 
- if (desc == "true") { - $("#sort-select").find("option[value='" + sort_order + "'][data-desc='True']").attr("selected", "selected"); - } - else { - $("#sort-select").val(sort_order); - } + $("#column-sort-order").find("option[value='" + sort_order + "']").attr("selected", "selected"); } - } }; diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html new file mode 100644 index 000000000..71c7dc5f3 --- /dev/null +++ b/webtool/templates/explorer/annotations-editor.html @@ -0,0 +1,55 @@ +
    + + +
    +
    +
    Label
    +
    Input type
    +
    Options
    +
    + +
    + {% if annotation_fields %} + + {% for field in annotation_fields %} + + {% set annotation_type = annotation_fields[field]["type"] %} + {% set label = annotation_fields[field]["label"] %} +
    + + + + + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    + {% endfor %} +
    + + +
    +
    + {% endif %} +
    + {% endfor %} + {% endif %} +
    +
    +
    + + + +

    Note: Changing input types will overwrite existing annotations for the field

    +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/annotations.html b/webtool/templates/explorer/annotations.html deleted file mode 100644 index fbb0b89bb..000000000 --- a/webtool/templates/explorer/annotations.html +++ /dev/null @@ -1,69 +0,0 @@ -
    -
    - -
    ×
    - -
    -
    -
    Label
    -
    Input type
    -
    Options
    -
    - -
    - {% if annotation_fields %} - - {% for field in annotation_fields %} - - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} -
    - - - - - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    - {% endfor %} -
    - - -
    -
    - {% endif %} -
    - {% endfor %} - {% endif %} -
    -
    -
    - - - -

    Note: Changing input types will overwrite existing annotations for the field

    -
    -
    -
    - -
    -
    - - - | - - -
    - -
    \ No newline at end of file diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index 3ab99c3b0..3dc77990d 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -5,43 +5,57 @@

    + {% if custom_fields and custom_fields[0] == "invalid" %}
    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).
    {% endif %} - {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -
    + {% if key and post_count > max_posts %} +
    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.
    + {% set post_count = max_posts %} {% endif %} - {# some different info for views generated by a direct API call #} - {% if not key and has_database %} -

    Showing {{ post_count }} posts from {{ datasource }}.

    - {% else %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    - {% endif %} - - -
    -

    Sort posts by: - -

    -
    +
    + {% if not key and has_database %} + Showing {{ post_count }} posts from {{ datasource }}. + {% else %} + Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total). + {% endif %} +
    + +
    + +
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 4ab1fd206..05fa4c0a5 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -23,17 +23,12 @@ {% endif %} -{{ key }} +{% set key = dataset.data.key %} {% include "explorer/controls.html" %} {% include "explorer/pagination.html" %} -{% if not thread %} - {% include "explorer/annotations.html" %} -{% endif %} - -
      {% for post in posts %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 576c1e484..7e1b389a9 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -21,15 +21,15 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import format_chan_post, error, setting_required from common.lib.dataset import DataSet -from common.lib.helpers import strip_tags +from common.lib.helpers import strip_tags, convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/result//explorer/', defaults={'page': 1}) -@app.route('/result//explorer/page/') +@app.route('/results//explorer/', defaults={'page': 1}) +@app.route('/results//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -62,14 +62,14 @@ def explorer_dataset(key, page=1): return error(403, error="This dataset is private.") if len(dataset.get_genealogy()) > 1: - return error(404, error="Unavailable for top-level datasets") + return error(404, error="Only available for top-level datasets.") results_path = dataset.check_dataset_finished() if not results_path: - return error(404, error="This dataset didn't finish executing") + return error(404, error="This dataset didn't finish executing.") if datasource not in config.get("explorer.config") and not config["explorer.config"][datasource]["enabled"]: - return error(404, error="Explorer functionality disabled for %s" % datasource) + return error(404, error="Explorer functionality disabled for %s." % datasource) # The amount of posts to show on a page posts_per_page = config.get("explorer.posts_per_page", 50) @@ -90,10 +90,9 @@ def explorer_dataset(key, page=1): sort = None # Check if we have to reverse the order. - reverse = True if request.args.get("desc") in ("true", True) else False - - # Check if we have to convert the sort value to an integer. 
- force_number = True if request.args.get("int") in ("true", True) else False + reverse = True if request.args.get("order") == "reverse" else False + print(request.args.get("order")) + print(reverse) # Load posts post_ids = [] @@ -161,9 +160,9 @@ def explorer_dataset(key, page=1): annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, parameters=parameters, posts=posts, annotation_fields=annotation_fields, annotations=annotations, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) -@app.route('/result///explorer') +@app.route('/results///explorer') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -519,7 +518,7 @@ def get_image_file(img_file): return send_file(str(image_path)) -def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False, **kwargs): +def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): """ Loop through both csv and NDJSON files. This is basically a wrapper function for `iterate_items()` with the @@ -531,24 +530,22 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False :param key, str: The dataset object. :param sort_by, str: The item key that determines the sort order. :param reverse, bool: Whether to sort by largest values first. - :param force_number, bool: Whether the sort value should be converted to a - floating point number. """ # Storing posts in the right order here - posts = [] - - # Generate reader on the basis of sort value - # At the moment, this is very inefficient, but - # suffices for the few cases where `sort` is used. - #sort_by_index = next(reader).index(sort) - #reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) - #sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True) + sorted_posts = [] - for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort]): - posts.append(item) - - for post in posts: + try: + for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort], reverse=reverse): + sorted_posts.append(item) + except TypeError: + # Dataset fields can contain integers and empty strings. + # Since these cannot be compared, we will convert every + # empty string to 0. 
+ for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: convert_to_float(x[sort]), reverse=reverse): + sorted_posts.append(item) + + for post in sorted_posts: yield post def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): From c33fd721a7b5aa50b06803043de1dd0666edf238 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Thu, 18 Apr 2024 19:55:30 +0200 Subject: [PATCH 018/204] Style and fix annotation field editor, enable config settings for CSS --- .../dmi-tcatv2/explorer/dmi-tcat-explorer.css | 84 --------- webtool/static/css/dataset-page.css | 22 +++ .../{explorer-default.css => default.css} | 0 .../static/css/explorer/dmi-tcat.css | 0 .../static/css/explorer/douyin.css | 0 .../static/css/explorer/fourchan.css | 0 .../static/css/explorer/instagram.css | 0 .../static/css/explorer/reddit.css | 0 .../{telegram-search.css => telegram.css} | 0 .../static/css/explorer/tiktok.css | 0 .../static/css/explorer/tiktok_urls.css | 0 .../static/css/explorer/tumblr.css | 0 .../static/css/explorer/twitter-import.css | 0 .../static/css/explorer/twitterv2.css | 0 webtool/static/css/stylesheet.css | 10 +- webtool/static/js/explorer.js | 165 +++++++++--------- .../components/result-result-row.html | 4 +- .../explorer/annotations-editor.html | 105 ++++++----- webtool/templates/explorer/controls.html | 118 +++++++------ webtool/templates/explorer/explorer.html | 21 ++- webtool/templates/explorer/pagination.html | 1 + webtool/templates/explorer/post.html | 14 +- webtool/views/views_explorer.py | 43 +---- 23 files changed, 264 insertions(+), 323 deletions(-) delete mode 100644 datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css rename webtool/static/css/explorer/{explorer-default.css => default.css} (100%) rename datasources/dmi-tcat/explorer/dmi-tcat-explorer.css => webtool/static/css/explorer/dmi-tcat.css (100%) rename datasources/douyin/explorer/douyin-explorer.css => webtool/static/css/explorer/douyin.css (100%) rename datasources/fourchan/explorer/fourchan-explorer.css => webtool/static/css/explorer/fourchan.css (100%) rename datasources/instagram/explorer/instagram-explorer.css => webtool/static/css/explorer/instagram.css (100%) rename datasources/reddit/explorer/reddit-explorer.css => webtool/static/css/explorer/reddit.css (100%) rename webtool/static/css/explorer/{telegram-search.css => telegram.css} (100%) rename datasources/tiktok/explorer/tiktok-explorer.css => webtool/static/css/explorer/tiktok.css (100%) rename datasources/tiktok_urls/explorer/tiktok_urls-explorer.css => webtool/static/css/explorer/tiktok_urls.css (100%) rename datasources/tumblr/explorer/tumblr-explorer.css => webtool/static/css/explorer/tumblr.css (100%) rename datasources/twitter-import/explorer/twitter-import-explorer.css => webtool/static/css/explorer/twitter-import.css (100%) rename datasources/twitterv2/explorer/twitterv2-explorer.css => webtool/static/css/explorer/twitterv2.css (100%) diff --git a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css b/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css deleted file mode 100644 index 86bf76e27..000000000 --- a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css +++ /dev/null @@ -1,84 +0,0 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
- - */ - -body { - background-color: white; -} - -.posts .post { - position: relative; - background-color: white; - max-width: 620px; - border: 1px solid #d6d6d6; - border-radius: 10px; - min-height: 50px; -} - -.posts .post header { - display: inline-block; - line-height: 1.7em; - margin-bottom: 5px; - border: none; - color: rgb(104, 119, 130); -} - -.posts .post header .post_id { - display: none; -} - -.posts .post header .author { - color: black; -} - -.posts .post header .profile_picture { - float: left; - margin-right: 15px; -} - -.posts .post header .profile_picture img { - border-radius: 100px; - width: 50px; -} - -.posts .post header .profile_picture:after { - display: none; -} - -.posts .post article { - margin: 0; - padding: 0; -} - -.posts .post.op { - background-color: white; - color: black; -} - -.posts .post .post-content { - display: inline-block; -} - -.posts .post .post-image { - margin-bottom: 10px; -} - -.posts .post .post-image img { - border-radius: 10px; -} - -.posts .external-url { - color: rgb(104, 119, 130); -} - -.posts .post.op .post-annotations, .posts .post .post-annotations { - border-radius: 10px; - background-color: rgb(241, 249, 255); - color: #474747; -} - -span.hashtag { - color: rgb(29, 155, 240); -} \ No newline at end of file diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index e82742bda..c9b04c605 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -694,4 +694,26 @@ body.image-preview { #image-zoom:checked + label img { max-height: none; cursor: zoom-out; +} + +/* Explorer view */ +#annotation-fields-editor-controls { + display: grid; + grid-template-columns: auto auto auto; +} + +#annotation-fields-editor-controls>div { + border-bottom: 1px solid var(--contrast-bright); +} + +#edit-annotation-fields #input-warning { + color: var(--accent-error); +} + +/* Remove all styles for explorer posts */ +/* these ought to be defined specifically */ +/* and 4CAT styles shouldn't interfere. 
*/ +#explorer-posts, #explorer-posts > ol li { + all: initial; + padding: 0; } \ No newline at end of file diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/default.css similarity index 100% rename from webtool/static/css/explorer/explorer-default.css rename to webtool/static/css/explorer/default.css diff --git a/datasources/dmi-tcat/explorer/dmi-tcat-explorer.css b/webtool/static/css/explorer/dmi-tcat.css similarity index 100% rename from datasources/dmi-tcat/explorer/dmi-tcat-explorer.css rename to webtool/static/css/explorer/dmi-tcat.css diff --git a/datasources/douyin/explorer/douyin-explorer.css b/webtool/static/css/explorer/douyin.css similarity index 100% rename from datasources/douyin/explorer/douyin-explorer.css rename to webtool/static/css/explorer/douyin.css diff --git a/datasources/fourchan/explorer/fourchan-explorer.css b/webtool/static/css/explorer/fourchan.css similarity index 100% rename from datasources/fourchan/explorer/fourchan-explorer.css rename to webtool/static/css/explorer/fourchan.css diff --git a/datasources/instagram/explorer/instagram-explorer.css b/webtool/static/css/explorer/instagram.css similarity index 100% rename from datasources/instagram/explorer/instagram-explorer.css rename to webtool/static/css/explorer/instagram.css diff --git a/datasources/reddit/explorer/reddit-explorer.css b/webtool/static/css/explorer/reddit.css similarity index 100% rename from datasources/reddit/explorer/reddit-explorer.css rename to webtool/static/css/explorer/reddit.css diff --git a/webtool/static/css/explorer/telegram-search.css b/webtool/static/css/explorer/telegram.css similarity index 100% rename from webtool/static/css/explorer/telegram-search.css rename to webtool/static/css/explorer/telegram.css diff --git a/datasources/tiktok/explorer/tiktok-explorer.css b/webtool/static/css/explorer/tiktok.css similarity index 100% rename from datasources/tiktok/explorer/tiktok-explorer.css rename to webtool/static/css/explorer/tiktok.css diff --git a/datasources/tiktok_urls/explorer/tiktok_urls-explorer.css b/webtool/static/css/explorer/tiktok_urls.css similarity index 100% rename from datasources/tiktok_urls/explorer/tiktok_urls-explorer.css rename to webtool/static/css/explorer/tiktok_urls.css diff --git a/datasources/tumblr/explorer/tumblr-explorer.css b/webtool/static/css/explorer/tumblr.css similarity index 100% rename from datasources/tumblr/explorer/tumblr-explorer.css rename to webtool/static/css/explorer/tumblr.css diff --git a/datasources/twitter-import/explorer/twitter-import-explorer.css b/webtool/static/css/explorer/twitter-import.css similarity index 100% rename from datasources/twitter-import/explorer/twitter-import-explorer.css rename to webtool/static/css/explorer/twitter-import.css diff --git a/datasources/twitterv2/explorer/twitterv2-explorer.css b/webtool/static/css/explorer/twitterv2.css similarity index 100% rename from datasources/twitterv2/explorer/twitterv2-explorer.css rename to webtool/static/css/explorer/twitterv2.css diff --git a/webtool/static/css/stylesheet.css b/webtool/static/css/stylesheet.css index 0fd5d7733..8c928e78e 100644 --- a/webtool/static/css/stylesheet.css +++ b/webtool/static/css/stylesheet.css @@ -956,6 +956,11 @@ article section.data-overview .description { color: var(--contrast-bright); } +.pagination .details { + margin: 0 auto; + text-align: center; +} + .tabs { border-bottom: 1px dotted var(--contrast-dark); max-height: 5em; @@ -1206,8 +1211,3 @@ ol.result-list li.has_results 
.property-container.analysis a { padding: 0; margin: 0.5em 0 0 0; } - -#explorer-posts, #explorer-posts > ol li { - all: initial; - padding: 0; -} \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 8c45e9b7e..ac160e269 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -22,57 +22,33 @@ const annotations = { init: function() { - // Show and hide the annotation controls - $("#toggle-annotation-controls").on("click", function() { - $("#annotation-controls-buttons").toggleClass("hidden"); - if ($("#annotation-controls-buttons").hasClass("hidden")) { - $(this).html(""); - } - else { - $(this).html(""); - } - }); + let edit_field_box = $("#edit-annotation-fields"); + let editor = $("#annotation-fields-editor"); + let editor_controls = $("#annotation-fields-editor-controls"); // Add a new annotation field when clicking the plus icon - $("#add-annotation-field").on("click", function(){$("#annotation-fields").append(annotations.getAnnotationsDiv);}); + $("#new-annotation-field").on("click", function(){ + let annotations_div = annotations.getAnnotationsDiv(); + $(annotations_div).insertBefore(edit_field_box);}); // Show and hide the annotations editor $("#toggle-annotation-fields").on("click", function(){ - $("#annotations-editor-container").toggle(); - if ($("#annotation-controls-buttons").hasClass("hidden")) { - $(this).html(""); + editor.toggleClass("hidden"); + if (editor.hasClass("hidden")) { + $("#toggle-annotation-fields").html(" Edit fields"); } else { - $(this).html(""); + $("#toggle-annotation-fields").html(" Hide editor"); } }); - $("#annotations-editor").click(function(e) { - e.stopPropagation(); - }); // Keep track of when the annotation fields were edited. - $("#annotation-fields").on("click", "#add-annotation-field, .delete-input, .delete-input i, .delete-option-field, .delete-option-field i", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); - }); - $("#annotation-fields").on("change keydown", "input, select", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); - }); - - // Close the annotation field editor (ask whether unsaved changes can be discarded) - $("#annotations-editor-container, #close-annotation-fields").click(function(e){ - e.preventDefault(); - if (!$("#save-annotation-fields").prop("disabled")) { - let conf = confirm("Close without applying input fields?"); - if (conf) { - $("#annotations-editor-container").hide(); - $("#annotation-fields").html(old_annotation_fields); - $("#save-annotation-fields").addClass("invalid").prop("disabled", true); - } - } - else { - $("#annotations-editor-container").hide(); - } + editor_controls.on("click", "#apply-annotation-fields, .delete-input, .delete-option-field", function() { + $("#apply-annotation-fields").removeClass("disabled"); }); + editor_controls.on("change keydown", "input, select", function() { + $("#apply-annotation-fields").removeClass("disabled"); + }); // Show and hide annotations $("#toggle-annotations").on("click", function(){ @@ -82,26 +58,32 @@ const annotations = { }); // Delete an entire annotation input - $("#annotation-fields").on("click", ".annotation-field > .delete-input", function(e){$(this).parent().remove();}); + // We're in a grid of threes, so this involves three divs + editor_controls.on("click", ".annotation-field > .delete-input", function(e){ + let parent_div = $(this).parent().parent(); + parent_div.next().remove(); // Input type + 
parent_div.next().remove(); // Options + parent_div.remove(); // Label + }); // Make saving available when annotation fields are changed - $("#annotation-fields").on("click", ".annotation-field > .option-fields > .option-field > .delete-option-field", function() { + editor_controls.on("click", ".delete-option-field", function() { annotations.deleteOption(this); }); - $("#annotation-fields").on("change", ".annotation-field > .annotation-field-type", function(e) {annotations.toggleField(e.target);}); + editor_controls.on("change", ".annotation-field-type", function(e) {annotations.toggleField(e.target);}); - // Make enter add a new option field - $("#annotation-fields").on("keypress", "input", function(e){ + // Make enter apply the option fields + editor_controls.on("keypress", "input", function(e){ if (e.which == 13) { annotations.applyAnnotationFields(); } }); - // Save the annotations fields to the database - $("#save-annotation-fields").on("click", annotations.applyAnnotationFields); + // Save the annotation fields to the database + $("#apply-annotation-fields").on("click", annotations.applyAnnotationFields); // Dynamically add a new option field when another is edited - $("#annotation-fields").on("keyup", ".annotation-field > .option-fields > .option-field > input", function(e) { + editor_controls.on("keyup", ".option-field > input", function(e) { if ($(this).val().length > 0) { annotations.addOptions(e.target); } @@ -131,7 +113,7 @@ const annotations = { // Ask whether the next page should be opened without saving annotations $('a > .page').click(function(){ if (!$("#save-annotations").prop('disabled')) { - return confirm("You'll lose unsaved annotations for this page if you don't save first.\nDo you still want to continue?"); + return confirm("Unsaved annotations are lost if you don't save before leaving the page.\nLeave anyway?"); } }) @@ -148,18 +130,20 @@ const annotations = { }, toggleField: function (el) { - // Change the type of input fields when switching in the dropdown + let type = $(el).val(); let old_type = $(el).attr("data-val"); + let options = $(el).parent().parent().next(); + let option_fields = options.find(".option-field"); + if (type == "text" || type == "textarea") { - $(el).parent().find(".option-fields").remove(); + option_fields.remove(); } else if (type == "dropdown" || type == "checkbox") { - if (!($(el).siblings(".option-fields").length) > 0) { - $(el).after("
      "); - $(el).next().append(annotations.getInputField); + if (option_fields.length == 0) { + options.append(annotations.getInputField); } } }, @@ -171,7 +155,7 @@ const annotations = { // no empty fields available, add a new one. let no_empty_fields = true; let input_fields = $(el).parent().siblings(); - + console.log(input_fields) if (!$(el).val().length > 0) { no_empty_fields = false; } @@ -183,6 +167,7 @@ const annotations = { no_empty_fields = false; } }); + // Add a new field if there's no empty ones if (no_empty_fields) { $(el).parent().after(annotations.getInputField); } @@ -204,7 +189,7 @@ const annotations = { return false; } $(this).append(` - `); + `); }); } }, @@ -223,7 +208,7 @@ const annotations = { // Validates and converts the fields in the annotations editor. // Returns an object with the set annotation fields. - annotation_fields = {}; + var annotation_fields = {}; var warning = ""; var labels_added = [] @@ -233,9 +218,14 @@ const annotations = { // Parse information from the annotations editor. $(".annotation-field").each(function(){ - + // To align the input form, we're in a grid of threes: + // label, input type, options. + // Navigate the DOM to get these elements: let label_field = $(this).children(".annotation-field-label"); - let label = label_field.val().replace(/\s+/g, ' ');; + let type_field = $(this).parent().next(); + let options_field = $(this).parent().next().next(); + + let label = label_field.val().replace(/\s+/g, ' '); // Get the random identifier of the field, so we // can later check if it already exists. @@ -253,7 +243,7 @@ const annotations = { } // Set the types and values of the annotation - type = $(this).children(".annotation-field-type").val(); + type = type_field.find(".annotation-field-type").val(); // Keep track of the labels we've added labels_added.push(label) @@ -268,8 +258,8 @@ const annotations = { let no_options_added = true; let option_id = "" - $(this).find(".option-field > input").each(function(){ - + options_field.find(".option-field > input").each(function(){ + console.log(this) let option_label = $(this).val(); let option_id = this.id.replace("input-", ""); @@ -306,10 +296,10 @@ const annotations = { } }); + console.log(annotation_fields) if (warning.length > 0) { return warning; } - console.log(annotation_fields) return annotation_fields; }, @@ -320,9 +310,8 @@ const annotations = { var annotation_fields = annotations.parseAnnotationFields(e); var fields_to_add = {}; - // Show an error message if the annotation fields were not valid. - if (typeof annotation_fields == 'string') { + if (typeof annotation_fields == "string") { annotations.warnEditor(annotation_fields); return } @@ -331,11 +320,11 @@ const annotations = { // the annotation fields to each post on the page. else { - $("#save-annotation-fields").html(" Applying") + $("#apply-annotation-fields").html(" Applying") // Remove warnings annotations.warnEditor("") - $("#annotation-fields").find("input").each(function(){ + $("#annotation-field").find("input").each(function(){ $(this).removeClass("invalid"); }); $(".option-fields").find("input").each(function(){ @@ -579,7 +568,7 @@ const annotations = { } } - $("#save-annotation-fields").html(" Apply") + $("#apply-annotation-fields").html(" Apply") }, saveAnnotationFields: function (annotation_fields){ @@ -607,7 +596,7 @@ const annotations = { // If the query is accepted by the server. 
if (response == 'success') { $("#annotations-editor-container").hide(); - $("#save-annotation-fields").addClass("disabled"); + $("#apply-annotation-fields").addClass("disabled"); } // If the query is rejected by the server. @@ -703,7 +692,7 @@ const annotations = { annotations.enableSaving(); $("#save-annotations").html(" Annotations saved"); $("#save-annotations").addClass("disabled"); - old_annotation_fields = $("#annotation-fields").html(); + old_annotation_fields = $("#annotation-field").each(); // alert(alert_message); } else { @@ -775,7 +764,7 @@ const annotations = { warnEditor: function(warning) { - let warn_field = $("#annotations-input-warning"); + let warn_field = $("#input-warning"); warn_field.html(warning); if (warn_field.hasClass("hidden")) { warn_field.removeClass("hidden"); @@ -787,13 +776,13 @@ const annotations = { let ta = $("#toggle-annotations"); if (ta.hasClass("shown")) { ta.removeClass("shown"); - ta.html(" Hide annotations"); - $(".post-annotations").show(200); + ta.html(" Show annotations"); + $(".post-annotations").addClass("hidden"); } else { ta.addClass("shown"); - ta.html(" Show annotations"); - $(".post-annotations").hide(200); + ta.html(" Hide annotations"); + $(".post-annotations").removeClass("hidden"); } }, @@ -802,17 +791,25 @@ const annotations = { if (id == undefined || id == 0) { id = annotations.randomInt(); } + // Returns an annotation div element with a pseudo-random ID - return `
      - - - -
      `.replace("{{FIELD_ID}}", id); + return `
      +
      + + +
      +
      +
      +
      + +
      +
      +
      `.replace("{{FIELD_ID}}", id); }, getInputField: function(id){ diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 7f71f0ef4..2d7972398 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -44,14 +44,14 @@ {% endif %} -
 {% if __user_config("privileges.can_use_explorer") and has_explorer %} +</a> Explorer</a> - {% endif %}</li> + {% endif %} {% endif %}</li 
    \ No newline at end of file diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index 71c7dc5f3..f16c7c325 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -1,55 +1,80 @@ -
    - + + + +
    +
    +
    + Label + + +
    +
    +
    +
    + Input type + + +
    +
    +
    +
    + Options + + +
    +
    -
    -
    -
    Label
    -
    Input type
    -
    Options
    -
    + {% if annotation_fields %} -
    - {% if annotation_fields %} - - {% for field in annotation_fields %} + {% for field in annotation_fields %} + {% set annotation_type = annotation_fields[field]["type"] %} + {% set label = annotation_fields[field]["label"] %} - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} -
    +
    +
    - + +
    +
    +
    +
    +
    +
    - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    +
    {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    {% endfor %} -
    - - -
    +
    +
    - {% endif %} -
    - {% endfor %} - {% endif %} -
    +
    -
    - - - -

    Note: Changing input types will overwrite existing annotations for the field

    + {% else %} +
    + {% endif %} + {% endfor %} + {% endif %} +
    +
    + New field + Apply +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index 3dc77990d..bf1921896 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -1,61 +1,67 @@ -
    -

    - {{ dataset.get_label() }} -

    - - - {% if custom_fields and custom_fields[0] == "invalid" %} -
    - Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). -
    - {% endif %} - {% if key and post_count > max_posts %} -
    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.
    - {% set post_count = max_posts %} - {% endif %} - -
    - {% if not key and has_database %} - Showing {{ post_count }} posts from {{ datasource }}. - {% else %} - Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total). +
    +
    +

    + {{ dataset.get_label() }} - Explorer +

    + + {% if custom_fields and custom_fields[0] == "invalid" %} +
    + Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). +
    {% endif %} -
    - -
    - +
    +
    Now showing
    +
    + {% if not key and has_database %} + {{ post_count }} posts from {{ datasource }}. + {% else %} + Posts {{ offset + 1 }}—{{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} total). + {% endif %} +
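    +                {# A hedged illustration, not part of the patch: assuming the view sets
    +                   offset = (page - 1) * posts_per_page, then with posts_per_page = 50 and
    +                   post_count = 312, page 2 reads "Posts 51—100 (312 total)", while the
    +                   conditional above caps the upper bound at post_count on the last page. #}
    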
    +
    +
    -
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 05fa4c0a5..d2d6e3b9a 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -6,8 +6,6 @@ {% block body %} - - @@ -16,12 +14,6 @@ var annotation_fields = {% if annotation_fields %}{{ annotation_fields | safe }}{% else %}""{% endif %} - -{% if custom_css %} - -{% endif %} {% set key = dataset.data.key %} @@ -29,6 +21,19 @@ {% include "explorer/pagination.html" %} + +{% if "css" in datasource_config %} + {% if datasource_config.css == "preset" %} + + {% elif datasource_config.css == "custom" %} + + {% else %} + + {% endif %} +{% endif %} + + +
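    +{# Editor's sketch of the three branches above (assumed semantics): "preset" loads a
    +   stylesheet shipped with the datasource, "custom" loads admin-supplied CSS, and any
    +   other value falls back to the generic Explorer stylesheet. #}
    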
      {% for post in posts %} diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html index ebf522890..2161f22bd 100644 --- a/webtool/templates/explorer/pagination.html +++ b/webtool/templates/explorer/pagination.html @@ -1,4 +1,5 @@
    From 4a92b1488c324674d922b8836bdc968532399307 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 16:03:21 +0200 Subject: [PATCH 071/204] Remove unused UserInput --- common/lib/user_input.py | 1 - 1 file changed, 1 deletion(-) diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 4de5478c2..dc7a9f547 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -36,7 +36,6 @@ class UserInput: OPTION_HUE = "hue" # colour hue OPTION_DATASOURCES = "datasources" # data source toggling OPTION_DATASOURCES_TABLE = "datasources_table" # a table with settings per data source - OPTION_DATASOURCES_TEXT = "datasources_text" # text input per data source (via dropdown) OPTIONS_COSMETIC = (OPTION_INFO, OPTION_DIVIDER) From c4b19434af52ecc2a57f7c8d609aa080ae5cdc15 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 16:04:56 +0200 Subject: [PATCH 072/204] Remove unnecessary UserInput imports --- datasources/instagram/search_instagram.py | 1 - datasources/linkedin/search_linkedin.py | 1 - datasources/parler/search_parler.py | 1 - datasources/tiktok/search_tiktok.py | 1 - datasources/twitter-import/search_twitter.py | 1 - 5 files changed, 5 deletions(-) diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 642593220..4c096acd8 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -10,7 +10,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem, MissingMappedField from common.lib.exceptions import WorkerInterruptedException, MapItemException -from common.lib.helpers import UserInput class SearchInstagram(Search): diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 53d61a707..cddd27663 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -11,7 +11,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchLinkedIn(Search): """ diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 07b6116ce..8ccc7ccd8 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -10,7 +10,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchParler(Search): diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index b3214bc42..90f443b49 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -9,7 +9,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchTikTok(Search): diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index 51df5ef51..274045fb3 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -9,7 +9,6 @@ from backend.lib.search import Search from common.lib.helpers import strip_tags from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchTwitterViaZeeschuimer(Search): From fcb747301809ed96f0c97491b4e2dd64fe37f295 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 17:56:07 +0200 Subject: [PATCH 073/204] Use dictionary order as sort order for config settings --- 
    common/lib/config_definition.py | 25 ++++++++++++++----------- webtool/views/views_admin.py | 15 +++++++++++++-- webtool/views/views_explorer.py | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 09d82f7ef..e1486a28b 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -4,18 +4,27 @@ Possible options and their default values. Options are actually set in 4CAT's Database. Additional options can be defined in Data sources or Processors as `config` objects. + +The order of the dictionary below determines the order of the settings in the interface. + """ from common.lib.user_input import UserInput import json config_definition = { - "datasources._intro": { + "datasources.intro": { "type": UserInput.OPTION_INFO, "help": "Data sources enabled below will be offered to people on the 'Create Dataset' page. Additionally, " "people can upload datasets for these by for example exporting them with " "[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer) to this 4CAT instance.\n\n" "Some data sources offer further settings which may be configured on other tabs." }, + "datasources.intro2": { + "type": UserInput.OPTION_INFO, + "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings " + "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the " + "expiration status of existing datasets." + }, "datasources.enabled": { "type": UserInput.OPTION_DATASOURCES, "default": ["ninegag", "douban", "douyin", "imgur", "upload", "instagram", "linkedin", "parler", "help": "Data Sources", "tooltip": "A list of enabled data sources that people can choose from when creating a dataset page." }, - "datasources._intro2": { - "type": UserInput.OPTION_INFO, - "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings " - "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the " - "expiration status of existing datasets."
    
- }, "datasources.expiration": { "type": UserInput.OPTION_TEXT_JSON, "default": {"fourchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightkun": {"enabled": False, "allow_optout": False, "timeout": 0}, "ninegag": {"enabled": True, "allow_optout": False, "timeout": 0}, "bitchute": {"enabled": True, "allow_optout": False, "timeout": 0}, "dmi-tcat": {"enabled": False, "allow_optout": False, "timeout": 0}, "dmi-tcatv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "douban": {"enabled": True, "allow_optout": False, "timeout": 0}, "douyin": {"enabled": True, "allow_optout": False, "timeout": 0}, "gab": {"enabled": True, "allow_optout": False, "timeout": 0}, "imgur": {"enabled": True, "allow_optout": False, "timeout": 0}, "upload": {"enabled": True, "allow_optout": False, "timeout": 0}, "instagram": {"enabled": True, "allow_optout": False, "timeout": 0}, "linkedin": {"enabled": True, "allow_optout": False, "timeout": 0}, "parler": {"enabled": True, "allow_optout": False, "timeout": 0}, "reddit": {"enabled": False, "allow_optout": False, "timeout": 0}, "telegram": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok-urls": {"enabled": False, "allow_optout": False, "timeout": 0}, "truthsocial": {"enabled": True, "allow_optout": False, "timeout": 0}, "tumblr": {"enabled": False, "allow_optout": False, "timeout": 0}, "twitter": {"enabled": True, "allow_optout": False, "timeout": 0}, "twitterv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "usenet": {"enabled": False, "allow_optout": False, "timeout": 0}, "vk": {"enabled": False, "allow_optout": False, "timeout": 0}}, @@ -305,13 +308,13 @@ "global": True }, # Explorer settings - "explorer.__basic-explanation": { + "explorer.basic-explanation": { "type": UserInput.OPTION_INFO, "help": "4CAT's Explorer feature lets you navigate and annotate datasets as if they " "appared on their original platform. This is intended to facilitate qualitative " "exploration and manual coding." }, - "explorer.__max_posts": { + "explorer.max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -319,14 +322,14 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer.posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, - "explorer._config_explanation": { + "explorer.config_explanation": { "type": UserInput.OPTION_INFO, "help": "Per data source, you can enable or disable the Explorer. 
Posts will be formatted through a generic template " "made of [this HTML file](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/templates/explorer/" diff --git a/webtool/views/views_admin.py b/webtool/views/views_admin.py index d982f1042..fcd0c2e98 100644 --- a/webtool/views/views_admin.py +++ b/webtool/views/views_admin.py @@ -571,10 +571,12 @@ def manipulate_settings(): flash("Invalid settings: %s" % str(e)) all_settings = config.get_all(user=None, tags=[tag]) + options = {} changed_categories = set() - for option in sorted({*all_settings.keys(), *definition.keys()}): + + for option in {*all_settings.keys(), *definition.keys()}: tag_value = all_settings.get(option, definition.get(option, {}).get("default")) global_value = global_settings.get(option, definition.get(option, {}).get("default")) is_changed = tag and global_value != tag_value @@ -616,7 +618,16 @@ def manipulate_settings(): changed_categories.add(option.split(".")[0]) tab = "" if not request.form.get("current-tab") else request.form.get("current-tab") - options = {k: options[k] for k in sorted(options, key=lambda o: options[o]["tabname"])} + + # We are ordering the options based on how they are ordered in their dictionaries, + # and not the database order. To do so, we're adding a simple config order number + # and sort on this. + config_order = 0 + for k, v in definition.items(): + options[k]["config_order"] = config_order + config_order += 1 + + options = {k: options[k] for k in sorted(options, key=lambda o: (options[o]["tabname"], options[o].get("config_order", 0)))} # 'data sources' is one setting but we want to be able to indicate # overrides per sub-item diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 9ed869c0d..61fb467e6 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -66,10 +66,10 @@ def explorer_dataset(key, page=1): return error(404, error="Explorer functionality disabled for %s." % datasource) # The amount of posts to show on a page - posts_per_page = config.get("explorer.__posts_per_page", 50) + posts_per_page = config.get("explorer.posts_per_page", 50) # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.__max_posts', 500000) + max_posts = config.get('explorer.max_posts', 500000) # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 From e672933ad53f5cc0f34bca937bb640e4115118f9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 12:36:38 +0200 Subject: [PATCH 074/204] Change name of "Explore" button to "Explore & annotate" --- webtool/templates/components/result-result-row.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 253e83c34..bec6e0d49 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -47,7 +47,7 @@ {% if __user_config("privileges.can_use_explorer") and has_explorer %}
  • - Explorer + Explore & annotate
  • From 0d2eef2d034969151efd8739d8b0413dc31b64ea Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 12:52:38 +0200 Subject: [PATCH 075/204] Space out Twitter metrics better --- webtool/static/css/explorer/twitter.css | 18 ++++-------------- .../explorer/datasource-templates/twitter.html | 8 ++++---- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/webtool/static/css/explorer/twitter.css b/webtool/static/css/explorer/twitter.css index af9fa1c1a..debdceb02 100644 --- a/webtool/static/css/explorer/twitter.css +++ b/webtool/static/css/explorer/twitter.css @@ -70,23 +70,13 @@ margin-top: 20px; } -.time, .metrics, .atname { +.time, .metrics, .atname, .external-url a { color: #7a8a97; } -.posts .post .metrics span { - margin-right: 60px; -} - -.verified { - color: rgb(29, 155, 240) -} - -.posts .external-url { - position: absolute; - bottom: 10px; - right: 10px; - color: rgb(104, 119, 130); +.posts .post .metrics { + display: flex; + justify-content: space-between; } span.hashtag { diff --git a/webtool/templates/explorer/datasource-templates/twitter.html b/webtool/templates/explorer/datasource-templates/twitter.html index 12fbf9c5c..fa238674a 100644 --- a/webtool/templates/explorer/datasource-templates/twitter.html +++ b/webtool/templates/explorer/datasource-templates/twitter.html @@ -72,11 +72,11 @@ {{ post.get("retweet_count") }} {% if post.get("impression_count") %} {{ post.get("impression_count") | numberify }}{% endif %} + + {% if not pseudonymised %} + + {% endif %}
    - - {% if not pseudonymised %} - - {% endif %}
    \ No newline at end of file From 7389e9b19d2ca1edf06e5854bc9c5069699d27e5 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 13:29:57 +0200 Subject: [PATCH 076/204] Include index in Explorer posts loop --- webtool/templates/explorer/explorer.html | 1 + 1 file changed, 1 insertion(+) diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 7e9ae2e14..92fb27298 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -53,6 +53,7 @@
      {% for post in posts %} + {% set post_count = loop.index %} {% include "explorer/post.html" %} {% endfor %}
    From b8e1267c3d19232ff013765c7f12daded3282660 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 17:58:18 +0200 Subject: [PATCH 077/204] Telegram Explorer template v0.5 --- webtool/static/css/explorer/telegram.css | 73 ++++++++++++++++++ .../components/result-result-row.html | 2 +- .../datasource-templates/telegram.html | 74 +++++++++++++++++++ webtool/templates/explorer/explorer.html | 2 +- 4 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 webtool/static/css/explorer/telegram.css create mode 100644 webtool/templates/explorer/datasource-templates/telegram.html diff --git a/webtool/static/css/explorer/telegram.css b/webtool/static/css/explorer/telegram.css new file mode 100644 index 000000000..5b3a6c0a1 --- /dev/null +++ b/webtool/static/css/explorer/telegram.css @@ -0,0 +1,73 @@ +@font-face { + font-family: 'Open Sans'; + src: url("../fonts/OpenSans-Regular.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + src: url("../fonts/OpenSans-Bold.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-style: italic; + src: url("../fonts/OpenSans-Italic.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + font-style: italic; + src: url("../fonts/OpenSans-BoldItalic.ttf") +} + +* { + font-family: "Open Sans", Arial; + font-size: 16px; + line-height: 1.5; +} + +.explorer-content ol li { + background-color: #6fa788; + padding: 1px; +} + +.posts .post-content { + list-style-type: none; + background-color: white; + width: 450px; + margin: 0 auto; + margin-top: 2px; + margin-bottom: 2px; + border-radius: 5px 15px 15px 5px; + padding: 12px 17px; +} + +.post-content.new-group { + border-radius: 0 10px 10px 10px; + margin-top: 10px; +} + +.author { + color: #2984cd; + font-weight: bold; +} + +.media-container img { + width: 100%; +} + +.day { + margin: 8px; + text-align: center; + color: white; +} + +.day span { + padding: 5px; + padding-left: 10px; + padding-right: 10px; + background-color: rgba(0,0,0,.3); + border-radius: 20px; +} \ No newline at end of file diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index bec6e0d49..06bd59290 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -49,7 +49,7 @@ Explore & annotate - + {% endif %} diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html new file mode 100644 index 000000000..417a5ddbc --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -0,0 +1,74 @@ + + + {% set day = post.unix_timestamp | datetime(fmt="%d %B", wrap=False) %} + {% set prev_post = posts[post_index - 1] if post_index > 0 else {} %} + {% set new_day = day if not prev_post or prev_post.get("unix_timestamp", 0) | datetime(fmt="%d %B", wrap=False) != day else False %} + {% set new_author = True if not prev_post or prev_post.author != post.author else False %} + + {% if new_day %} +
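    +        {# A sketch of the grouping flags set above (assumption: posts arrive in
    +           chronological order): for consecutive posts by authors a, a, b, new_author
    +           is True, False, True, so only the first post of each run shows the avatar
    +           and name; new_day holds the "%d %B" label whenever the day changes, and
    +           this block then renders a Telegram-style date chip. #}
    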
    + {{ new_day }} +
    + {% endif %} + +
    +
    + {% if new_author or new_day %} + {% set author = post.author_username if not post.author_name else post.author_name %} + {% if post.author_name %} +
    + {% if not pseudonymised %} + + {% for name in author_name.split() %} + {{ name[0] }} + {% endfor %} + {% else %} + + {% endif %} +
    + +
    + + + + + + + + +
    + {% endif %} +
    +
    + {% if not pseudonymised %} + {{ author }} + {% else %} + + {% endif %} +
    + {% else %} +
    + {% endif %} + {% if post.attachment_type %} +
    + + + +
    + {% endif %} +
    + {{ post.body }} +
    + + +
    + {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC +
    + + {% if not pseudonymised %} + + {% endif %} +
    +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 92fb27298..137864b5e 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -53,7 +53,7 @@
      {% for post in posts %} - {% set post_count = loop.index %} + {% set post_index = loop.index - 1 %} {% include "explorer/post.html" %} {% endfor %}
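    {# Editor's note, a sketch rather than part of the patch: Jinja's loop.index is
       1-based, so post_index = loop.index - 1 is the 0-based position (loop.index0
       would be equivalent), letting datasource templates look back via
       posts[post_index - 1] to decide on day separators and author grouping. #}
    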
    From f3f6f41509da5165767577c1a4f44beeb32ed064 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 12:14:40 +0200 Subject: [PATCH 078/204] Add a string character counter template that also handles graphemes --- webtool/lib/template_filters.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 108241d93..91b72a3f6 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -6,6 +6,7 @@ import os import re import requests +import regex from urllib.parse import urlencode, urlparse from webtool import app, config @@ -236,6 +237,25 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body +@app.template_filter('string_counter') +def _jinja2_filter_string_counter(string, is_emoji=False): + # Returns a dictionary with counts of characters in a string. + # Also handles emojis. + + # We need to convert multi-character emojis ("graphemes") to one character. + if is_emoji == True: + string = regex.finditer(r"\X", string) # \X matches graphemes + string = [m.group(0) for m in string] + + # Count 'em + counter = {} + for s in string: + if s not in counter: + counter[s] = 0 + counter[s] += 1 + + return counter + @app.template_filter('parameter_str') def _jinja2_filter_parameter_str(url): # Returns the current URL parameters as a valid string. From e904351e6fee40ad6c7eb45017ca5862d5e02872 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 14:15:06 +0200 Subject: [PATCH 079/204] Fix incorrect emoji handling with resolved references in Telegram --- datasources/telegram/search_telegram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 477cd9999..29e1b5195 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -641,7 +641,10 @@ def map_item(message): if message.get("reactions") and message["reactions"].get("results"): for reaction in message["reactions"]["results"]: - reactions += reaction["reaction"] * reaction["count"] + reaction_type = reaction["reaction"] + if isinstance(reaction_type, dict): + reaction_type = reaction_type["emoticon"] + reactions += reaction_type * reaction["count"] return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", From 9fcd9aaaa21fe696672727c60bd2984e267a735e Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:18:49 +0200 Subject: [PATCH 080/204] Get markdown text from telegram messages --- datasources/telegram/search_telegram.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 29e1b5195..89c5e321a 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -327,8 +327,10 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): i = 0 try: entity_posts = 0 + async for message in client.iter_messages(entity=query, offset_date=max_date): entity_posts += 1 + i += 1 if self.interrupted: raise ProcessorInterruptedException( @@ -346,6 +348,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) 
serialized_message = SearchTelegram.serialize_obj(message) + if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -646,6 +649,12 @@ def map_item(message): reaction_type = reaction_type["emoticon"] reactions += reaction_type * reaction["count"] + is_reply = False + reply_to = "" + if message.get("reply_to"): + is_reply = True + reply_to = message["reply_to"].get("reply_to_msg_id", "") + return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", "thread_id": thread, @@ -655,7 +664,8 @@ def map_item(message): "author_name": fullname, "author_is_bot": "yes" if user_is_bot else "no", "body": message["message"], - "reply_to": message.get("reply_to_msg_id", ""), + "is_reply": is_reply, + "reply_to": reply_to, "views": message["views"] if message["views"] else "", "forwards": message.get("forwards", MissingMappedField(0)), "reactions": reactions, @@ -728,6 +738,7 @@ def serialize_obj(input_obj): obj = input_obj.copy() mapped_obj = {} + for item, value in obj.items(): if type(value) is datetime: mapped_obj[item] = value.timestamp() @@ -746,6 +757,11 @@ def serialize_obj(input_obj): # Add the _type if the original object was a telethon type if type(input_obj).__module__ in ("telethon.tl.types", "telethon.tl.custom.forward"): mapped_obj["_type"] = type(input_obj).__name__ + + # Store the markdown-formatted text + if hasattr(input_obj, "text"): + mapped_obj["message"] = input_obj.text + return mapped_obj @staticmethod From c0c7bfa949fa474085c0fd50b2775f0b1d110b64 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:33:32 +0200 Subject: [PATCH 081/204] ..but then a bit more elegant and also for resolved messages --- datasources/telegram/search_telegram.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 89c5e321a..4c3ffcfc7 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -348,7 +348,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) serialized_message = SearchTelegram.serialize_obj(message) - + if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -759,7 +759,7 @@ def serialize_obj(input_obj): mapped_obj["_type"] = type(input_obj).__name__ # Store the markdown-formatted text - if hasattr(input_obj, "text"): + if type(input_obj).__name__ == "Message": mapped_obj["message"] = input_obj.text return mapped_obj From 92ac7c4c01499dee177702275a2ea9a9dc88961f Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:33:58 +0200 Subject: [PATCH 082/204] styling --- webtool/templates/explorer/datasource-templates/tiktok.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/datasource-templates/tiktok.html b/webtool/templates/explorer/datasource-templates/tiktok.html index 882d87ac7..e89ef13bc 100644 --- a/webtool/templates/explorer/datasource-templates/tiktok.html +++ b/webtool/templates/explorer/datasource-templates/tiktok.html @@ -28,7 +28,7 @@ - {{ post.body | social_mediafy(datasource='tiktok') | safe }} + {{ post.body | social_mediafy(datasource="tiktok") | safe }}
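    {# For illustration, an assumption rather than part of the patch: social_mediafy
       wraps #hashtags and @mentions in anchor tags built from the per-platform base
       URLs, so a TikTok body like "dance video #fyp @creator" gains links composed of
       the configured hashtag base plus "fyp" and the mention base plus "creator". #}
    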
    From d4256aeca852467239e22a40b0a8ba7a3b0c3a2a Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:34:12 +0200 Subject: [PATCH 083/204] Telegram template v1.0 --- webtool/lib/template_filters.py | 42 ++-- webtool/static/css/explorer/telegram.css | 214 +++++++++++++++--- .../datasource-templates/telegram.html | 91 +++++--- 3 files changed, 263 insertions(+), 84 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 91b72a3f6..d5d12385d 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -186,11 +186,6 @@ def _jinja2_filter_social_mediafy(body, datasource=""): if not datasource: return body - # Supported data sources - known_datasources = ["twitter", "tiktok", "instagram", "tumblr", "linkedin"] - if datasource not in known_datasources: - return body - # Base URLs after which tags and @-mentions follow, per platform base_urls = { "twitter": { @@ -212,38 +207,47 @@ def _jinja2_filter_social_mediafy(body, datasource=""): "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", "mention": "https://linkedin.com/in/" + }, + "telegram": { } } + # Supported data sources + known_datasources = list(base_urls.keys()) + if datasource not in known_datasources: + return body + # Add URL links for url in urls_from_text(body): body = re.sub(url, "%s" % (url, url), body) # Add hashtag links - tags = re.findall(r"#[\w0-9]+", body) - # We're sorting tags by length so we don't incorrectly - # replace tags that are a substring of another, longer tag. - tags = sorted(tags, key=lambda x: len(x), reverse=True) - for tag in tags: - # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. - # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). - body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) + if "hashtag" in base_urls[datasource]: + tags = re.findall(r"#[\w0-9]+", body) + # We're sorting tags by length so we don't incorrectly + # replace tags that are a substring of another, longer tag. + tags = sorted(tags, key=lambda x: len(x), reverse=True) + for tag in tags: + # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. + # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). + body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) # Add @-mention links - mentions = re.findall(r"@[\w0-9]+", body) - mentions = sorted(mentions, key=lambda x: len(x), reverse=True) - for mention in mentions: - body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) + if "mention" in base_urls[datasource]: + mentions = re.findall(r"@[\w0-9]+", body) + mentions = sorted(mentions, key=lambda x: len(x), reverse=True) + for mention in mentions: + body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) return body @app.template_filter('string_counter') -def _jinja2_filter_string_counter(string, is_emoji=False): +def _jinja2_filter_string_counter(string, emoji=False): # Returns a dictionary with counts of characters in a string. # Also handles emojis. # We need to convert multi-character emojis ("graphemes") to one character.
    
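    # Editor's illustration (hedged, not part of the patch): the third-party `regex`
    # module's r"\X" pattern matches one extended grapheme cluster, i.e. one
    # user-perceived character, so multi-codepoint emoji stay intact:
    #
    #   import regex
    #   [m.group(0) for m in regex.finditer(r"\X", "ab👍")]  # -> ['a', 'b', '👍']
    #
    # A ZWJ sequence such as a family emoji would likewise count once, whereas plain
    # list(string) splits it into its component codepoints. The filter then tallies
    # these clusters, e.g. "👍👍❤" -> {"👍": 2, "❤": 1}.
    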
- if is_emoji == True: + if emoji == True: string = regex.finditer(r"\X", string) # \X matches graphemes string = [m.group(0) for m in string] diff --git a/webtool/static/css/explorer/telegram.css b/webtool/static/css/explorer/telegram.css index 5b3a6c0a1..b4789b9e2 100644 --- a/webtool/static/css/explorer/telegram.css +++ b/webtool/static/css/explorer/telegram.css @@ -1,65 +1,186 @@ @font-face { - font-family: 'Open Sans'; + font-family: "Open Sans"; src: url("../fonts/OpenSans-Regular.ttf") } @font-face { - font-family: 'Open Sans'; + font-family: "Open Sans"; font-weight: bold; src: url("../fonts/OpenSans-Bold.ttf") } -@font-face { - font-family: 'Open Sans'; - font-style: italic; - src: url("../fonts/OpenSans-Italic.ttf") +* { + font-size: 15px; + line-height: 1.4; } -@font-face { - font-family: 'Open Sans'; - font-weight: bold; - font-style: italic; - src: url("../fonts/OpenSans-BoldItalic.ttf") +.explorer-content { + background-image: linear-gradient(#6ca587, #c4d18b); + padding-top: 5px; + padding-bottom: 20px; } -* { +.explorer-content ol li { + padding: 1px; + background: none; +} + +.posts .post { font-family: "Open Sans", Arial; - font-size: 16px; - line-height: 1.5; + display: block; + position: relative; + max-width: 580px; + list-style-type: none; + margin: 0 auto; } -.explorer-content ol li { - background-color: #6fa788; - padding: 1px; +.posts .post .post-container.new-group { + margin-top: 6px; +} + +/* Profile picture */ +.posts .post .profile-picture-container { + display: inline-block; + width: 60px; + vertical-align: top; +} + +.profile-picture { + background-image: linear-gradient(#389ed5, #59c8e2); + border-radius: 100%; + width: 50px; + height: 50px; + line-height: 53px; + float: left; + text-align: center; } -.posts .post-content { +.profile-picture .initials { + color: white; + font-size: 23px; + width: 100%; + height: 100%; +} + +/* Post content */ +.posts .post .post-content { + display: inline-block; + max-width: 80%; list-style-type: none; background-color: white; - width: 450px; - margin: 0 auto; - margin-top: 2px; - margin-bottom: 2px; - border-radius: 5px 15px 15px 5px; + border-radius: 5px 20px 20px 5px; padding: 12px 17px; + z-index: -1; + overflow: hidden; } -.post-content.new-group { - border-radius: 0 10px 10px 10px; - margin-top: 10px; +.posts .post .post-content.new-group { + border-radius: 0px 20px 20px 5px; } -.author { +.bubble-left { + position: relative; + margin-right: -5px; + float: right; + z-index: 0; +} + +.author, .author a, .author a:hover { + margin-bottom: 5px; color: #2984cd; font-weight: bold; + text-decoration: none; +} + +.posts .post .body { + display: inline; + padding-top: 5px; + padding-bottom: 5px; +} + +.posts .post .body a { + color: #2984cd; +} + +.posts .post .reply_to { + height: 20px; + padding: 5px; + margin-bottom: 2px; + background-color: #e4f1f9; + border-left: 4px solid #2e96d2; + border-radius: 5px; +} + +.media-container { + max-height: 200px; + margin-top: -12px; + margin-left: -17px; + margin-right: -17px; + margin-bottom: 10px; + overflow: hidden; } .media-container img { + margin-top: -155px; width: 100%; } +.post-content.new-group .media-container { + margin-top: 10px; +} + +.post-content.new-group .media-container img { + margin-top: -155px; + border-radius: 0px; +} + +/* Emoji reaction counts */ +.reactions { + margin-top: 3px; + margin-bottom: 3px; +} + +.reaction { + display: inline-block; + color: #168acd; + background-color: #e8f5fc; + font-weight: bold; + border-radius: 15px; + margin-top: 1px; + 
padding: 4px; + padding-left: 8px; + padding-right: 8px; + font-size: 16px; + vertical-align: middle; +} + +.reaction .reaction-count { + padding-left: 4px; + font-size: 14px; +} + +/* TOD on the bottom of the post */ +.metrics { + display: inline-block; + padding-left: 10px; + float: right; +} + +.metrics span { + font-size: 14px; + padding-left: 3px; + color: #a0acb6; +} + +/* External url button */ + +.external-url i { + color: #168acd; +} + +/* Day indicator between posts */ .day { - margin: 8px; + margin: 15px; text-align: center; color: white; } @@ -70,4 +191,41 @@ padding-right: 10px; background-color: rgba(0,0,0,.3); border-radius: 20px; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-image: linear-gradient(#389ed5, #59c8e2); + color: white; + border-radius: 5px 20px 20px 5px; + margin-left: 63px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; } \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html index 417a5ddbc..e7844392c 100644 --- a/webtool/templates/explorer/datasource-templates/telegram.html +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -12,60 +12,77 @@
    {% endif %} -
    -
    - {% if new_author or new_day %} - {% set author = post.author_username if not post.author_name else post.author_name %} - {% if post.author_name %} +
    +
    + {% if new_author or new_day %}
    - {% if not pseudonymised %} - - {% for name in author_name.split() %} - {{ name[0] }} - {% endfor %} + + {% set author = post.author_username if not post.author_name else post.author_name %} + {% if not pseudonymised and author %} + + {% for name in author.split()[:2] %}{{ name[0] }}{% endfor %} {% else %} - + {% endif %} +
    -
    + - + + +
    +
    +
    + {% if not pseudonymised %} + {{ author }} + {% else %} + + {% endif %} +
    + {% else %} +
    +
    + {% endif %} + {% if post.attachment_type %} +
    + + +
    {% endif %} -
    -
    - {% if not pseudonymised %} - {{ author }} - {% else %} - +
    + {% if post.reply_to %} + {% endif %} -
    - {% else %} -
    + {{ post.body | markdown | social_mediafy(datasource="telegram") | safe }} +
    + + {% if post.reactions %} +
    + {% set reactions = post.reactions|string_counter(emoji=True) %} + {% for reaction, count in reactions.items() %} + {{ reaction }}{{ count }} + {% endfor %} +
    {% endif %} - {% if post.attachment_type %} -
    - - - -
    +
    + {% if post.views %} {{ post.views|numberify }}{% endif %} + {% if post.forwards %} {{ post.forwards|numberify }}{% endif %} + + {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC + + {% if post.unix_timestamp_edited %} + + | edited {{ post.unix_timestamp_edited | datetime(fmt="%H:%M", wrap=False) }} UTC + {% endif %} -
    - {{ post.body }} -
    - - -
    - {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC -
    - {% if not pseudonymised %} {% endif %} From 2204cf70799e2cab8d2f7f4ac4570dbca0cf2244 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:43:26 +0200 Subject: [PATCH 084/204] Show URLs nicely in Telegram template --- webtool/lib/template_filters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index d5d12385d..89a1e7336 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -218,8 +218,9 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body # Add URL links - for url in urls_from_text(body): - body = re.sub(url, "%s" % (url, url), body) + if datasource != "telegram": # Telegram has mardown links + for url in urls_from_text(body): + body = re.sub(url, "%s" % (url, url), body) # Add hashtag links if "hasthag" in base_urls[datasource]: From a904e65f7cc348ee460e83c7b477aa9babd6fe98 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:16:32 +0200 Subject: [PATCH 085/204] Add markdown text to Telegram --- datasources/telegram/search_telegram.py | 6 +++--- .../templates/explorer/datasource-templates/telegram.html | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 4c3ffcfc7..4ab7f9ee4 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -39,8 +39,7 @@ class SearchTelegram(Search): extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated - has_explorer_preset = True # Whether this data source has preset CSS and field settings for the Explorer - + # cache details_cache = None failures_cache = None @@ -664,6 +663,7 @@ def map_item(message): "author_name": fullname, "author_is_bot": "yes" if user_is_bot else "no", "body": message["message"], + "body_markdown": message["message_markdown"], "is_reply": is_reply, "reply_to": reply_to, "views": message["views"] if message["views"] else "", @@ -760,7 +760,7 @@ def serialize_obj(input_obj): # Store the markdown-formatted text if type(input_obj).__name__ == "Message": - mapped_obj["message"] = input_obj.text + mapped_obj["message_markdown"] = input_obj.text return mapped_obj diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html index e7844392c..f3b8dd9e4 100644 --- a/webtool/templates/explorer/datasource-templates/telegram.html +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -61,7 +61,7 @@ {% if post.reply_to %} {% endif %} - {{ post.body | markdown | social_mediafy(datasource="telegram") | safe }} + {{ post.body_markdown | markdown | social_mediafy(datasource="telegram") | safe }}
    {% if post.reactions %} From bb96af7cc27c8f18e5673200e766ef7ef1bbe2f0 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:16:42 +0200 Subject: [PATCH 086/204] Typo in Truth social search --- datasources/truth/search_truth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/truth/search_truth.py b/datasources/truth/search_truth.py index 52057e0fa..c1b10ad8a 100644 --- a/datasources/truth/search_truth.py +++ b/datasources/truth/search_truth.py @@ -35,7 +35,7 @@ def map_item(post): """ Parse Truth Social post - :param node: Data as received from Truth Social + :param post: Data as received from Truth Social :return dict: Mapped item """ From bc4f566528f946453aaf4f9d2a13ce5bcd04fdc9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:17:27 +0200 Subject: [PATCH 087/204] Update Tumblr search so it works with the Neue Posts Format. --- datasources/tumblr/search_tumblr.py | 278 +++++++++++++++------------- 1 file changed, 146 insertions(+), 132 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 89784b9e3..ae8876a83 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -6,13 +6,17 @@ import time import pytumblr +import requests from requests.exceptions import ConnectionError from datetime import datetime +from ural import urls_from_text from common.config_manager import config from backend.lib.search import Search -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, strip_tags from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ConfigException +from common.lib.item_mapping import MappedItem + __author__ = "Sal Hagen" __credits__ = ["Sal Hagen", "Tumblr API (api.tumblr.com)"] @@ -27,7 +31,7 @@ class SearchTumblr(Search): category = "Search" # category title = "Search Tumblr" # title displayed in UI description = "Retrieve Tumblr posts by hashtag or blog." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI + extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -88,8 +92,8 @@ def get_options(cls, parent_dataset=None, user=None): "at max. Insert tags or names of blogs, one on each line. You may insert up to ten tags or " "blogs.\n\nTumblr tags may include whitespace and commas. A `#` before the tag is optional.\n\n" "Tag search only get posts explicitly associated with the exact tag you insert here. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is unfortunately not " - "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf 4CAT reached its Tumblr API rate " + "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is not " + "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf this 4CAT reached its Tumblr API rate " "limit, try again 24 hours later." 
}, "search_scope": { @@ -181,6 +185,7 @@ def get_items(self, query): queries = parameters.get("query").split(", ") fetch_reblogs = parameters.get("fetch_reblogs", False) + # Store all info here results = [] @@ -211,12 +216,18 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return + # for each tag or blog, get post for query in queries: # Get posts per tag if scope == "tag": - new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date) + # Used for getting tagged posts, which uses requests instead. + api_key = self.parameters.get("consumer_key") + if not api_key: + api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] + + new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) # Get posts per blog elif scope == "blog": @@ -278,13 +289,22 @@ def get_items(self, query): self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post {key}") continue if reblog_post: - reblog_post = self.parse_tumblr_posts([reblog_post], reblog=True) results.append(reblog_post[0]) + # Rename some keys so it works with anonymisation + for i in range(len(results)): + for key in list(results[i].keys()): + if key.startswith("blog"): + results[i][key.replace("blog", "author")] = results[i].pop(key) + elif key == "post_url": + results[i]["author_post_url"] = results[i].pop("post_url") + elif key == "slug": + results[i]["author_post_slug"] = results[i].pop("slug") + self.job.finish() return results - def get_posts_by_tag(self, tag, max_date=None, min_date=None): + def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ Get Tumblr posts posts with a certain tag :param tag, str: the tag you want to look for @@ -324,8 +344,21 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): break try: - # Use the pytumblr library to make the API call - posts = self.client.tagged(tag, before=max_date, limit=20, filter="raw") + # PyTumblr does not allow to use the `npf` parameter yet + # for the `tagged` endpoint (opened a pull request), so + # we're using requests here. + params = { + "tag": tag, + "api_key": api_key, + "before": max_date, + "limit": 20, + "filter": "raw", + "npf": True + } + url = "https://api.tumblr.com/v2/tagged" + response = requests.get(url, params=params) + posts = response.json()["response"] + except ConnectionError: self.update_status("Encountered a connection error, waiting 10 seconds.") time.sleep(10) @@ -346,6 +379,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): retries = 0 if check_post["id"] not in self.seen_ids: unseen_posts.append(check_post) + posts = unseen_posts # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested. @@ -390,8 +424,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): # Append posts to main list else: - posts = self.parse_tumblr_posts(posts) - # Get all timestamps and sort them. 
post_dates = sorted([post["timestamp"] for post in posts]) @@ -515,13 +547,9 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw") + posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) posts = posts["posts"] - #if (max_date - posts[0]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) - except Exception as e: self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date)) @@ -543,8 +571,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): if "notes" in post: all_notes.append(post["notes"]) - posts = self.parse_tumblr_posts(posts) - # Get the lowest date max_date = sorted([post["timestamp"] for post in posts])[0] @@ -564,10 +590,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): all_posts += posts - #if (max_date - posts[len(posts) - 1]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) - if len(all_posts) >= self.max_posts: self.max_posts_reached = True break @@ -576,10 +598,10 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): return all_posts, all_notes - def get_post_notes(self, di_blogs_ids, only_text_reblogs=True): + def get_post_notes(self, blogs_ids, only_text_reblogs=True): """ Gets the post notes. - :param di_blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. + :param blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. :param only_text_reblogs, bool: Whether to only keep notes that are text reblogs. """ # List of dict to get reblogs. Items are: [{"blog_name": post_id}] @@ -588,14 +610,14 @@ def get_post_notes(self, di_blogs_ids, only_text_reblogs=True): max_date = None # Do some counting - len_blogs = len(di_blogs_ids) + len_blogs = len(blogs_ids) count = 0 # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 - for key, value in di_blogs_ids.items(): + for key, value in blogs_ids.items(): count += 1 @@ -653,7 +675,7 @@ def get_post_by_id(self, blog_name, post_id): raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") # Request the specific post. - post = self.client.posts(blog_name, id=post_id) + post = self.client.posts(blog_name, id=post_id, npf=True) # Tumblr API can sometimes return with this kind of error: # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} @@ -740,120 +762,112 @@ def validate_query(query, request, user): del query["daterange"] query["query"] = items - query["board"] = query.get("search_scope") + "s" # used in web interface # if we made it this far, the query can be executed return query - def parse_tumblr_posts(self, posts, reblog=False): + @staticmethod + def map_item(post): """ - Function to parse Tumblr posts into the same dict items. + Parse Tumblr posts. Tumblr posts can be many different types, so some data processing is necessary. - :param posts, list: List of Tumblr posts as returned form the Tumblr API. - :param reblog, bool: Whether the post concerns a reblog of posts from the original dataset. 
    - - returns list processed_posts, a list with dictionary items of post info. + :param post, dict: A Tumblr post as returned from the Tumblr API. + + :return dict: Mapped item """ - # Store processed posts here - processed_posts = [] - - media_tags = ["photo", "video", "audio"] - - # Loop through all the posts and write a row for each of them. - for post in posts: - post_type = post["type"] - - # The post's text is in different keys depending on the post type - if post_type in media_tags: - text = post["caption"] - elif post_type == "link": - text = post["description"] - elif post_type == "text" or post_type == "chat": - text = post["body"] - elif post_type == "answer": - text = post["question"] + "\n" + post["answer"] - else: - text = "" - - # Different options for video types (YouTube- or Tumblr-hosted) - if post_type == "video": - - video_source = post["video_type"] - # Use `get` since some videos are deleted - video_url = post.get("permalink_url") - - if video_source == "youtube": - # There's no URL if the YouTube video is deleted - if video_url: - video_id = post["video"]["youtube"]["video_id"] - else: - video_id = "deleted" - else: - video_id = "unknown" - - else: - video_source = None - video_id = None - video_url = None - - # All the fields to write - processed_post = { - # General columns - "type": post_type, - "timestamp": post["timestamp"], - "is_reblog": reblog, - - # Blog columns - "author": post["blog_name"], - "subject": post["blog"]["title"], - "blog_description": post["blog"]["description"], - "blog_url": post["blog"]["url"], - "blog_uuid": post["blog"]["uuid"], - "blog_last_updated": post["blog"]["updated"], - - # Post columns - "id": post["id"], - "post_url": post["post_url"], - "post_slug": post["slug"], - "thread_id": post["reblog_key"], - "body": text.replace("\x00", ""), - "tags": ", ".join(post["tags"]) if post.get("tags") else None, - "notes": post["note_count"], - "urls": post.get("link_url"), - "images": ",".join([photo["original_size"]["url"] for photo in post["photos"]]) if post.get("photos") else None, - - # Optional video columns - "video_source": video_source if post_type == "video" else None, - "video_url": video_url if post_type == "video" else None, - "video_id": video_id if post_type == "video" else None, - "video_thumb": post.get("thumbnail_url"), # Can be deleted - - # Optional audio columns - "audio_type": post.get("audio_type"), - "audio_url": post.get("audio_source_url"), - "audio_plays": post.get("plays"), - - # Optional link columns - "link_author": post.get("link_author"), - "link_publisher": post.get("publisher"), - "link_image": post.get("link_image"), - - # Optional answers columns - "asking_name": post.get("asking_name"), - "asking_url": post.get("asking_url"), - "question": post.get("question"), - "answer": post.get("answer"), - - # Optional chat columns - "chat": post.get("dialogue") - } - - # Store the processed post - processed_posts.append(processed_post) - - return processed_posts + media_types = ["photo", "video", "audio"] + + # We're getting info as Neue Post Format (NPF) blocks, + # so we need to loop through some 'blocks'.
    
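    +        # Editor's illustration (hedged, following Tumblr's public NPF documentation):
    +        # each entry in post["content"] is a typed dict along the lines of
    +        #   {"type": "text", "text": "hello", "formatting": [{"type": "bold", "start": 0, "end": 5}]}
    +        #   {"type": "image", "media": [{"url": "https://.../image.jpg"}]}
    +        # which is why the code below dispatches on block["type"] and, for text blocks,
    +        # re-applies the formatting ranges as Markdown.
    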
+ image_urls = [] + video_urls = [] + video_thumb_urls = [] + audio_urls = [] + audio_artists = [] + linked_urls = [] + question = "" + answers = "" + raw_text = "" + formatted_text = [] + + # Loop through "blocks" + for block in post.get("content", []): + block_type = block["type"] + + if block_type == "image": + image_urls.append(block["media"][0]["url"]) + elif block_type == "audio": + audio_urls.append(block["media"]["url"]) + audio_artists.append(block["artist"]) + elif block_type == "video": + video_urls.append(block["media"]["url"]) + if "filmstrip" in block: + video_thumb_urls.append(block["filmstrip"]["url"]) + elif block_type == "link": + linked_urls.append(block["url"]) + elif block_type == "poll": + question += block["question"] + answers = [a["answer_text"] for a in block["answers"]] + + # We're gonna add some formatting to the text + elif block_type == "text": + + text = block["text"] + + extra_chars = 0 + if block.get("formatting"): + for fmt in block["formatting"]: + + fmt_type = fmt["type"] + s = fmt["start"] + extra_chars # Start of formatted substring + e = fmt["end"] + extra_chars # End of formatted substring + + if fmt_type == "link": + text = text[:s] + "[" + text[s:e] + "](" + fmt["formatting"]["url"] + ")" + text[e:] + extra_chars += 4 + len(fmt["formatting"]["url"]) + elif fmt_type == "italic": + text = text[:s] + "*" + text[s:e] + "*" + text[e:] + extra_chars += 2 + elif fmt_type == "bold": + text = text[:s] + "**" + text[s:e] + "**" + text[e:] + extra_chars += 4 + + if block.get("subtype") == "unordered-list-item": + text = "- " + text + + raw_text += block["text"] + "\n" + formatted_text.append(text) + + return MappedItem({ + "id": post["id"], + "author": post["author_name"], + "thread_id": post["reblog_key"], + "timestamp": post["timestamp"], + "author_subject": post["author"]["title"], + "author_description": strip_tags(post["author"]["description"]), + "author_url": post["author"]["url"], + "author_uuid": post["author"]["uuid"], + "author_last_updated": post["author"]["updated"], + "author_post_url": post["author_post_url"], + "author_post_slug": post["author_post_slug"], + "body": raw_text, + "body_markdown": "\n".join(formatted_text), + "tags": ",".join(post["tags"]) if post.get("tags") else "", + "notes": post["note_count"], + "linked_urls": ",".join(linked_urls) if linked_urls else "", + "image_urls": ",".join(image_urls) if image_urls else "", + "video_urls": ",".join(video_urls) if video_urls else "", + "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", + "audio_urls": ",".join(audio_urls) if audio_urls else "", + "audio_artist": ",".join(audio_artists) if audio_artists else "", + "author_asking_name": post.get("asking_name", ""), + "author_asking_url": post.get("asking_url", ""), + "poll_question": question, + "poll_answers": ",".join(answers) + }) def after_process(self): """ From 10c885376ff1ad63cbc7011d4d777a9b8d2d5d56 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 15 Jul 2024 21:21:25 +0200 Subject: [PATCH 088/204] Fix notes fetching for Tumblr, add extra notes metrics to NDJSONs and `map_items` --- datasources/tumblr/DESCRIPTION.md | 4 +- datasources/tumblr/search_tumblr.py | 251 ++++++++++++++-------------- 2 files changed, 127 insertions(+), 128 deletions(-) diff --git a/datasources/tumblr/DESCRIPTION.md b/datasources/tumblr/DESCRIPTION.md index a2be57d25..8269204a1 100644 --- a/datasources/tumblr/DESCRIPTION.md +++ b/datasources/tumblr/DESCRIPTION.md @@ -7,10 +7,12 @@ Be aware that the data may contain personal 
information. It is thus recommended To comply with the Tumblr API requirements, Tumblr datasets are deleted after three days. ### Rate limits -4CAT uses an internal API key to get Tumblr posts. These are limited to the +If set, 4CAT uses an internal API key to get Tumblr posts. These are limited to the [following rate limits](https://www.tumblr.com/docs/en/api/v2#rate-limits). However, administrators may request a rate limit increase via Tumblr. +If no internal API key is set, you can insert your own. + ### Date bugs The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used tags, it may return zero posts, even though older posts *do* exist. Check the oldest post in diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index ae8876a83..bf64702c2 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -110,9 +110,10 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Tags/blogs", "tooltip": "Separate with commas or new lines." }, - "fetch_reblogs": { + "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Also fetch reblogs with text? (warning: slow)", + "help": "Get post notes (warning: slow)", + "tooltip": "Also retrieve post notes. Likes and replies are added to the original post. Text reblogs are added as new posts.", "default": False } } @@ -183,8 +184,7 @@ def get_items(self, query): parameters = self.dataset.get_parameters() scope = parameters.get("search_scope", "") queries = parameters.get("query").split(", ") - fetch_reblogs = parameters.get("fetch_reblogs", False) - + get_notes = parameters.get("get_notes", False) # Store all info here results = [] @@ -216,7 +216,6 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return - # for each tag or blog, get post for query in queries: @@ -231,8 +230,8 @@ def get_items(self, query): # Get posts per blog elif scope == "blog": - new_results, notes = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - all_notes.append(notes) + new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) + else: self.dataset.update_status("Invalid scope") break @@ -246,61 +245,35 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # If we also want the posts that reblogged the fetched posts: - if fetch_reblogs and not self.max_posts_reached and not self.api_limit_reached: - self.dataset.update_status("Getting notes from all posts") - - # Reblog information is already returned for blog-level searches - if scope == "blog": - text_reblogs = [] - - # Loop through and add the text reblogs that came with the results. - for post_notes in all_notes: - for post_note in post_notes: - for note in post_note: - if note["type"] == "reblog": - text_reblogs.append({note["blog_name"]: note["post_id"]}) - - # Retrieving notes for tag-based posts should be done one-by-one. - # Fetching them all at once is not supported by the Tumblr API. - elif scope == "tag": - # Prepare dicts to pass to `get_post_notes` - posts_to_fetch = {result["author"]: result["id"] for result in results} - - # First extract the notes of each post, and only keep text reblogs - text_reblogs = self.get_post_notes(posts_to_fetch) - - # Get the full data for text reblogs. 
- if text_reblogs: - connection_retries = 0 - for i, text_reblog in enumerate(text_reblogs): - self.dataset.update_status("Got %i/%i text reblogs" % (i, len(text_reblogs))) - if connection_retries >= 5: - self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") - break - for key, value in text_reblog.items(): - if connection_retries >= 5: - break - try: - reblog_post = self.get_post_by_id(key, value) - except ConnectionRefusedError: - connection_retries += 1 - self.failed_reblogs.append(key) - self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post {key}") - continue - if reblog_post: - results.append(reblog_post[0]) - - # Rename some keys so it works with anonymisation - for i in range(len(results)): - for key in list(results[i].keys()): - if key.startswith("blog"): - results[i][key.replace("blog", "author")] = results[i].pop(key) - elif key == "post_url": - results[i]["author_post_url"] = results[i].pop("post_url") - elif key == "slug": - results[i]["author_post_slug"] = results[i].pop("slug") + # Loop through the results once to add note data and fetch text reblogs, + len_results = len(results) # results will change in length when we add reblogs. + for i in range(len_results): + + post = results[i] + + # Get note information + if get_notes and not self.max_posts_reached and not self.api_limit_reached: + + # Reblog information is already returned for blog-level searches + # and is stored as `notes` in the posts themselves. + # Retrieving notes for tag-based posts must be done one-by-one; + # fetching them all at once is not supported by the Tumblr API. + if not "notes" in post: + self.dataset.update_status("Getting note data for post %i/%i" % (i, len_results)) + + # Prepare dicts to pass to `get_post_notes` + notes = self.get_post_notes(post["blog_name"], post["id"]) + + if notes: + results[i]["notes"] = notes + # Get the full data for text reblogs and add them as new posts + for note in notes: + if note["type"] == "reblog": + text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) + if text_reblog: + results.append(text_reblog) + self.job.finish() return results @@ -353,7 +326,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): "before": max_date, "limit": 20, "filter": "raw", - "npf": True + "npf": True, + "notes_info": True } url = "https://api.tumblr.com/v2/tagged" response = requests.get(url, params=params) @@ -528,9 +502,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): # Store all posts in here all_posts = [] - # Store notes here, if they exist and are requested - all_notes = [] - # Some retries to make sure the Tumblr API actually returns everything retries = 0 self.max_retries = 48 # 2 days @@ -565,11 +536,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): # Append posts to main list else: - # Keep the notes, if so indicated - if self.parameters.get("fetch_reblogs"): - for post in posts: - if "notes" in post: - all_notes.append(post["notes"]) # Get the lowest date max_date = sorted([post["timestamp"] for post in posts])[0] @@ -596,72 +562,61 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): self.dataset.update_status("Collected %s posts" % str(len(all_posts))) - return all_posts, all_notes + return all_posts - def get_post_notes(self, blogs_ids, only_text_reblogs=True): - """ - Gets the post notes. - :param blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. 
- :param only_text_reblogs, bool: Whether to only keep notes that are text reblogs. + def get_post_notes(self, blog_id, post_id): """ - # List of dict to get reblogs. Items are: [{"blog_name": post_id}] - text_reblogs = [] + Gets data on the notes of a specific post. + :param blog_id, str: The ID of the blog. + :param post_id, str: The ID of the post. + :returns: a list with dictionaries of notes. + """ + + post_notes = [] max_date = None # Do some counting - len_blogs = len(blogs_ids) count = 0 # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 - for key, value in blogs_ids.items(): - - count += 1 - - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") + count += 1 - # First, get the blog names and post_ids from reblogs - # Keep digging till there's nothing left, or if we can fetch no new notes - while True: - - # Requests a post's notes - notes = self.client.notes(key, id=value, before_timestamp=max_date) - - if only_text_reblogs: + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") - if "notes" in notes: - notes_retries = 0 + while True: - for note in notes["notes"]: - # If it's a reblog, extract the data and save the rest of the posts for later - if note["type"] == "reblog": - if note.get("added_text"): - text_reblogs.append({note["blog_name"]: note["post_id"]}) + # Requests a post's notes + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) + + if "notes" in notes: + notes_retries = 0 - if notes.get("_links"): - max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + for note in notes["notes"]: + post_notes.append(note) - # If there's no `_links` key, that's all. - else: - break + if notes.get("_links"): + max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - # If there's no "notes" key in the returned dict, something might be up - else: - self.dataset.update_status("Couldn't get notes for Tumblr request " + str(notes)) - notes_retries += 1 - pass + # If there's no `_links` key, that's all. + else: + break - if notes_retries > max_notes_retries: - self.failed_notes.append(key) - break + # If there's no "notes" key in the returned dict, something might be up + else: + self.dataset.update_status("Couldn't get notes for Tumblr post " + str(post_id)) + notes_retries += 1 + pass - self.dataset.update_status("Identified %i text reblogs in %i/%i notes" % (len(text_reblogs), count, len_blogs)) + if notes_retries > max_notes_retries: + self.failed_notes.append(post_id) + break - return text_reblogs + return post_notes def get_post_by_id(self, blog_name, post_id): """ @@ -674,12 +629,28 @@ def get_post_by_id(self, blog_name, post_id): if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True) + connection_retries = 0 + + while True: + if connection_retries >= 5: + self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") + break + try: + # Request the specific post. 
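+                # (`npf=True` requests the post in Neue Post Format, i.e. as a
+                # list of typed content blocks rather than legacy per-type fields.)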
+ post = self.client.posts(blog_name, id=post_id, npf=True) + + except ConnectionRefusedError: + connection_retries += 1 + self.failed_reblogs.append(note["id"]) + self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post " + note["id"]) + continue + + if post: + break # Tumblr API can sometimes return with this kind of error: # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if "posts" not in post: + if not post or "posts" not in post: return None # Get the first element of the list - it's always one post. @@ -780,7 +751,7 @@ def map_item(post): media_types = ["photo", "video", "audio"] - # We're getting info as Neue Text Format types, + # We're getting info as Neue Post Format types, # so we need to loop through some 'blocks'. image_urls = [] video_urls = [] @@ -792,6 +763,10 @@ def map_item(post): answers = "" raw_text = "" formatted_text = [] + authors_liked = [] + authors_reblogged = [] + authors_replied = [] + replies = [] # Loop through "blocks" for block in post.get("content", []): @@ -841,22 +816,44 @@ def map_item(post): raw_text += block["text"] + "\n" formatted_text.append(text) + # Add note data + for note in post.get("notes", []): + if note["type"] == "like": + authors_liked.append(note["blog_name"]) + elif note["type"] in ("posted", "reblog"): + # If the original post is a text reblog, it will also show up in the notes. + # We can skip these since the data is already in the main post dict. + if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: + authors_reblogged.append(note["blog_name"]) + elif note["type"] == "reply": + authors_replied.append(note["blog_name"]) + replies.append(note["reply_text"]) + return MappedItem({ + "type": post["original_type"] if "original_type" in post else post["type"], "id": post["id"], - "author": post["author_name"], + "author": post["blog_name"], "thread_id": post["reblog_key"], "timestamp": post["timestamp"], - "author_subject": post["author"]["title"], - "author_description": strip_tags(post["author"]["description"]), - "author_url": post["author"]["url"], - "author_uuid": post["author"]["uuid"], - "author_last_updated": post["author"]["updated"], - "author_post_url": post["author_post_url"], - "author_post_slug": post["author_post_slug"], + "author_subject": post["blog"]["title"], + "author_description": strip_tags(post["blog"]["description"]), + "author_url": post["blog"]["url"], + "author_uuid": post["blog"]["uuid"], + "author_last_updated": post["blog"]["updated"], + "author_post_url": post["post_url"], + "author_post_slug": post["slug"], + "is_reblog": True if post.get("original_type") == "note" else "", "body": raw_text, "body_markdown": "\n".join(formatted_text), "tags": ",".join(post["tags"]) if post.get("tags") else "", "notes": post["note_count"], + "like_count": len(authors_liked), + "authors_liked": ",".join(authors_liked), + "reblog_count": len(authors_reblogged), + "authors_reblogged": ",".join(authors_reblogged), + "reply_count": len(authors_replied), + "authors_replied": ",".join(authors_replied), + "replies": "\n\n".join(replies), "linked_urls": ",".join(linked_urls) if linked_urls else "", "image_urls": ",".join(image_urls) if image_urls else "", "video_urls": ",".join(video_urls) if video_urls else "", From ce16f972237c65185b6e2a51d87980c588aff2ff Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 16 Jul 2024 21:48:30 +0200 Subject: [PATCH 089/204] Make Tumblr search work with new blocks formatting, 
include some new content --- datasources/tumblr/search_tumblr.py | 43 ++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index bf64702c2..9b2c89314 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -759,6 +759,7 @@ def map_item(post): audio_urls = [] audio_artists = [] linked_urls = [] + linked_titles = [] question = "" answers = "" raw_text = "" @@ -768,8 +769,18 @@ def map_item(post): authors_replied = [] replies = [] + # Keep track if blocks belong to another post, + # which is stored in `layout`. + body_reblogged = [] + reblogged_text_blocks = [] + author_reblogged = "" + for layout_block in post.get("layout", []): + if layout_block["type"] == "ask": + reblogged_text_blocks += layout_block["blocks"] + author_reblogged = layout_block["attribution"]["blog"]["name"] + # Loop through "blocks" - for block in post.get("content", []): + for i, block in enumerate(post.get("content", [])): block_type = block["type"] if block_type == "image": @@ -783,12 +794,18 @@ def map_item(post): video_thumb_urls.append(block["filmstrip"]["url"]) elif block_type == "link": linked_urls.append(block["url"]) + if "title" in block: + linked_titles.append(block["title"]) + if "description" in block: + raw_text += block["description"] + "\n" + formatted_text.append(block["description"]) elif block_type == "poll": question += block["question"] answers = [a["answer_text"] for a in block["answers"]] # We're gonna add some formatting to the text - elif block_type == "text": + # Skip text that is part of a reblogged post. + elif block_type == "text" and i not in reblogged_text_blocks: text = block["text"] @@ -816,23 +833,28 @@ def map_item(post): raw_text += block["text"] + "\n" formatted_text.append(text) + elif block_type == "text" and i in reblogged_text_blocks: + body_reblogged.append(block["text"]) + # Add note data for note in post.get("notes", []): if note["type"] == "like": - authors_liked.append(note["blog_name"]) + # Inserting at the start of the list to maintain chronological order. + authors_liked.insert(0, note["blog_name"]) elif note["type"] in ("posted", "reblog"): # If the original post is a text reblog, it will also show up in the notes. # We can skip these since the data is already in the main post dict. 
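                # A note whose blog name and timestamp match the post's own
                # identifies the post itself rather than a genuine reblog.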
if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: - authors_reblogged.append(note["blog_name"]) + authors_reblogged.insert(0, note["blog_name"]) elif note["type"] == "reply": - authors_replied.append(note["blog_name"]) - replies.append(note["reply_text"]) + authors_replied.insert(0, note["blog_name"]) + replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], "id": post["id"], "author": post["blog_name"], + "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], "timestamp": post["timestamp"], "author_subject": post["blog"]["title"], @@ -840,11 +862,13 @@ def map_item(post): "author_url": post["blog"]["url"], "author_uuid": post["blog"]["uuid"], "author_last_updated": post["blog"]["updated"], - "author_post_url": post["post_url"], - "author_post_slug": post["slug"], + "post_url": post["post_url"], + "post_slug": post["slug"], "is_reblog": True if post.get("original_type") == "note" else "", "body": raw_text, "body_markdown": "\n".join(formatted_text), + "body_reblogged": "\n".join(body_reblogged) if body_reblogged else "", + "author_reblogged": author_reblogged, "tags": ",".join(post["tags"]) if post.get("tags") else "", "notes": post["note_count"], "like_count": len(authors_liked), @@ -855,13 +879,12 @@ def map_item(post): "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "linked_urls": ",".join(linked_urls) if linked_urls else "", + "linked_urls_titles": "\n".join(linked_titles) if linked_titles else "", "image_urls": ",".join(image_urls) if image_urls else "", "video_urls": ",".join(video_urls) if video_urls else "", "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", "audio_urls": ",".join(audio_urls) if audio_urls else "", "audio_artist": ",".join(audio_artists) if audio_artists else "", - "author_asking_name": post.get("asking_name", ""), - "author_asking_url": post.get("asking_url", ""), "poll_question": question, "poll_answers": ",".join(answers) }) From e392544eed6464613a9e429bb5e22df56bb6e887 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 16 Jul 2024 21:48:52 +0200 Subject: [PATCH 090/204] Tumblr Explorer Template v0.5 --- webtool/static/css/explorer/tumblr.css | 250 +++++++++++++++--- .../explorer/datasource-templates/tumblr.html | 131 +++++++++ 2 files changed, 346 insertions(+), 35 deletions(-) create mode 100644 webtool/templates/explorer/datasource-templates/tumblr.html diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index e3ef2eaa8..1937bb3c9 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -1,68 +1,248 @@ +/* General stuff */ .explorer-content { background-color: #001935; + padding: 20px } -#metadata, footer { - color: white; +.posts li.post { + position: relative; + list-style-type: none; + font-family: Helvetica, sans-serif; + background-color: white; + color: black; + font-size: 16px; + margin: 0 auto; + margin-top: 20px; + padding: 0px; + border-radius: 8px; + max-width: 540px; } -.content { - font-family: "Favorit", "Helvetica Neue", "HelveticaNeue", Helvetica, Arial, sans-serif; +.author { + font-size: 13px; + font-weight: bold; } -.posts li.post { - background-color: white; - color: black; - font-size: 14px; - left: 0; +.author-avatar { + width: 32px; + margin-right: 10px; +} + +.author-avatar img { border-radius: 
3px; - max-width: 540px; - padding: 0; + width: 100%; } -.posts li.post header { - display: inline-block; +/* Main author info */ +header { + display: flex; + align-items: center; + padding: 19px; text-decoration: none; - font-weight: bold; - border: none; - padding: 0px; - line-height: 1.7em; - margin: 25px; + color: black; + overflow: hidden; +} + +header a { + color: black; +} + +header .author-avatar { + display: inline-block; +} + +header .author { + display: inline-block; +} + +/* Media */ +.media-container { + width: 100%; margin-bottom: 10px; } -.posts li.post article { - padding: 0; - margin: 0; +.media-container img { + width: 100%; } -.posts li.post .post-content { +/* Post text content */ +.post-content { display: block; - margin: 25px; - margin-top: 0px; + padding: 0px 19px 0px 19px; } -.posts li.post .post-tags { +.post-content .body, .body-reblogged { + white-space: pre-wrap; + line-height: 1.5em; +} + +.reblogged-content { + margin-bottom: 19px; + display: inline-block; + max-width: 400px; + padding: 25px; + background-color: #ededed; +} + +.author-reblogged { + padding-bottom: 3px; +} + +.author-reblogged-avatar { + display: inline-block; +} + +.embedded-link { + padding: 30px; + background-color: #001935; + color: white; + text-align: center; + font-size: 18px; + border-radius: 15px; + margin-bottom: 19px; +} + +.embedded-link a { + color: white; +} + +.poll-question { + font-size: 18px; +} + +.poll-answer { + color: white; + background-color: #001935; + margin: 8px; + padding: 8px; + border-radius: 15px; + text-align: center; +} + +.posts .external-url { + position: absolute; + bottom: 0; + right: 0; + padding: 10px; +} + +.tags { + padding-top: 19px; + list-style-type: none; color: #5e5e5e; - margin-top: 20px; word-break: break-all; } -.posts li.post .author { +.tags a { + color: #5e5e5e; +} + +.tags li { + padding: 5px 5px 5px 0px; + display: inline-block; + background-color: white; +} + +/* Post footer */ +footer { + margin: 19px; + padding-top: 19px; + border-top: 1px solid rgba(0,0,0,0.13); +} + +.time { + color: #5e5e5e; +} + +/* Note metrics */ +.notes { +} + +.note-counts { + padding-top: 19px; +} + +.note-count { + display: inline-block; + color: #5a5a5a; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; +} + +.note-count.total { font-weight: bold; } -.posts li.post .post-image { - width: 100%; - margin-bottom: 15px; +/* Replies */ +.replies { + margin-top: 12px; + display: table; +} + +.reply { + background-color: white; + display: table-row; +} + +.reply .author-info { + display: table-cell; +} + +.reply .author-replied-avatar { + vertical-align: middle; + display: table-cell; + padding-right: 10px; +} + +.reply-content { + vertical-align: top; + margin-top: 5px; + margin-bottom: 5px; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; } -.posts li.post .external-url { - +.reply-content .author { + margin-bottom: 5px; } -.posts li.post .post-annotations { +/* Annotation fields */ +.post-annotations { background-color: white; - border-top: 1px solid #5e5e5e; - margin-right: 0; + color: black; + border-radius: 8px; + border-top: 1px solid #ebebeb; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation input { + border-radius: 5px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: 
inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; } \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html new file mode 100644 index 000000000..fec6a42ed --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -0,0 +1,131 @@ +
    + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
    + + + +
    + {% endif %} + + {{ post.get("author") }} + + {% else %} + + + {% endif %} + +
    + + +{% if post["image_urls"] or post["video_urls"] %} +
    + {% if post["image_urls"] %} + {% for image_url in post["image_urls"].split(",") %} + + {% endfor %} + {% elif post["video_thumb_urls"] %} + {% for video_thumb_url in post["video_thumb_urls"].split(",") %} + {% if not pseudonymised %}{% endif %} +
    +
    + {% if not pseudonymised %}
    {% endif %} + {% endfor %} + {% endif %} +
    +{% endif %} + +
    + + {% if post.get("body_reblogged") %} +
    +
    +
    {% if not pseudonymised %}{{ post["author_reblogged"] }}{% else %}{% endif %}
    +
    {{ post["body_reblogged"] }}
    +
    +
    +
    + {% if not pseudonymised %} + + {% endif %} +
    + {% endif %} + + {% if post.get("linked_urls") %} + {% for url in post["linked_urls"].split(",") %} + + {% endfor %} + {% endif %} + + {% if post.get("poll_question") %} +
    +
    {{ post["poll_question"] }}
    +
      + {% for poll_answer in post["poll_answers"].split(",") %} +
{{ poll_answer }}
    • + {% endfor %} +
    + +
    + {% endif %} + +
    {{ post.get("body_markdown") | social_mediafy(datasource='tumblr') | safe }}
    + + {% if post.get("tags") %} +
    +
      + {% for tag in post["tags"].split(",") %} +
#{{ tag }}
    • + {% endfor %} +
    +
    + {% endif %} +
    + +
    + + +
    Posted {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
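{# `datetime` is a 4CAT template filter, presumably defined in webtool/lib/template_filters.py; `fmt` takes a strftime pattern. #}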
    + + + {% if post.notes %} +
    +
    + {{ post.get("notes") | commafy }} note{% if post.get("notes", 0) > 1 %}s{% endif %} + + {% if post.get("reblog_count") %} + {{ post.reblog_count }} + {% endif %} + + {% if post.get("like_count") %} + + {% endif %} + + {% if post.get("reply_count") %} + {{ post.get("reply_count") }} + {% endif %} +
    + {% if post.get("authors_replied") %} +
    + {% for author_replied in post.get("authors_replied").split(",") %} +
    + {% if not pseudonymised %} + + {% endif %} +
    +
    +
    {% if not pseudonymised %}{{ author_replied }}{% else %}{% endif %}
    +
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") }}
    +
    +
  • + {% endfor %} +
    + {% endif %} +
    + {% endif %} + +
    \ No newline at end of file From 40f5fa09a015940b61b628a42799f05d69e4c33a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:48:30 +0200 Subject: [PATCH 091/204] Bump PyTumblr version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b75724a23..ad8eca1d0 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "psycopg2~=2.9.0", "pyahocorasick~=1.4.0", "PyMySQL~=1.0", - "PyTumblr==0.1.0", + "PyTumblr==0.1.2", "requests~=2.27", "requests_futures", "scenedetect==0.6.0.3", From 9e486ad7fa3476cc934db5436929d4766b181f19 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:48:57 +0200 Subject: [PATCH 092/204] Dashes are okay for Tumblr Blog names --- webtool/lib/template_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 89a1e7336..3bc75ebb9 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -229,13 +229,13 @@ def _jinja2_filter_social_mediafy(body, datasource=""): # replace tags that are a substring of another, longer tag. tags = sorted(tags, key=lambda x: len(x), reverse=True) for tag in tags: - # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. + # Match the string, but not if it's preceded by a >, which indicates that we've already added an anchor tag. # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) # Add @-mention links if "mention" in base_urls[datasource]: - mentions = re.findall(r"@[\w0-9]+", body) + mentions = re.findall(r"@[\w0-9-]+", body) mentions = sorted(mentions, key=lambda x: len(x), reverse=True) for mention in mentions: body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) From 9837a8a48102939c56e0f9743fc2a63566308b5a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:49:10 +0200 Subject: [PATCH 093/204] Better styling for Tumblr Explorer Template --- webtool/static/css/explorer/tumblr.css | 94 +++++++++++++++++--------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index 1937bb3c9..cc33bc29d 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -57,14 +57,34 @@ header .author { /* Media */ .media-container { - width: 100%; - margin-bottom: 10px; + position: relative; + margin: 3px -19px 3px -19px; + overflow-x: hidden; } .media-container img { width: 100%; } +.media-container.video img { + min-height: 300px; + width: auto; + filter: blur(1.5rem); +} + +.play-button { + position: absolute; + width: 100%; + top: 38%; + left: 45%; + font-size: 80px; +} + +.play-button i { + color: white; + opacity: .7; +} + /* Post text content */ .post-content { display: block; @@ -76,6 +96,10 @@ header .author { line-height: 1.5em; } +.post-content .body { + padding: 3px 0px 3px 0px; +} + .reblogged-content { margin-bottom: 19px; display: inline-block; @@ -107,27 +131,21 @@ header .author { } .poll-question { - font-size: 18px; + font-size: 20px; + padding: 3px 0px 3px 0px; } .poll-answer { color: white; background-color: #001935; margin: 8px; - padding: 8px; - border-radius: 15px; - text-align: center; -} - -.posts .external-url { - position: absolute; - bottom: 0; - 
right: 0; - padding: 10px; + padding: 8px; + border-radius: 15px; + text-align: center; } .tags { - padding-top: 19px; + padding-top: 5px; list-style-type: none; color: #5e5e5e; word-break: break-all; @@ -154,6 +172,14 @@ footer { color: #5e5e5e; } +.posts .external-url { + color: #00b4fa; + position: absolute; + top: 0; + right: 0; + padding: 15px; +} + /* Note metrics */ .notes { } @@ -191,8 +217,8 @@ footer { .reply .author-replied-avatar { vertical-align: middle; - display: table-cell; - padding-right: 10px; + display: table-cell; + padding-right: 10px; } .reply-content { @@ -202,47 +228,49 @@ footer { border-radius: 18px; border: 1px solid #ebebeb; padding: 9px 18px; + font-size: 14px; + color: #5e5e5e; } .reply-content .author { + color: black; margin-bottom: 5px; } /* Annotation fields */ .post-annotations { - background-color: white; - color: black; - border-radius: 8px; - border-top: 1px solid #ebebeb; + background-color: #7c5cff; + color: white; + border-radius: 0px 0px 8px 8px; } .post-annotation { - padding: 15px; + padding: 15px; } .post-annotation input { - border-radius: 5px; + border-radius: 5px; } .post-annotation > .annotation-label { - display: inline-block; - vertical-align: middle; - text-align: right; - min-width: 150px; - margin-right: 5px; - line-height: 1.6em; - overflow-x: hidden; + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; } .post-annotation.checkbox > .post-annotation-options { - display: inline-block; + display: inline-block; } .post-annotation-options { - display: inline-block; - vertical-align: top; + display: inline-block; + vertical-align: top; } .post-annotation-options > input { - display: inline-block; + display: inline-block; } \ No newline at end of file From 4d19e5230f6342dfeed0e9961f6998284c565eaa Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:49:35 +0200 Subject: [PATCH 094/204] Include post blocks in the right order in Tumblr Explorer Template --- .../explorer/datasource-templates/tumblr.html | 91 +++++++++++-------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index fec6a42ed..09f9d06a2 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -20,23 +20,6 @@ - -{% if post["image_urls"] or post["video_urls"] %} -
    - {% if post["image_urls"] %} - {% for image_url in post["image_urls"].split(",") %} - - {% endfor %} - {% elif post["video_thumb_urls"] %} - {% for video_thumb_url in post["video_thumb_urls"].split(",") %} - {% if not pseudonymised %}{% endif %} -
    -
    - {% if not pseudonymised %}
    {% endif %} - {% endfor %} - {% endif %} -
    -{% endif %}
    @@ -53,26 +36,58 @@ {% endif %}
    {% endif %} - - {% if post.get("linked_urls") %} - {% for url in post["linked_urls"].split(",") %} - - {% endfor %} - {% endif %} - - {% if post.get("poll_question") %} -
    -
    {{ post["poll_question"] }}
    -
      - {% for poll_answer in post["poll_answers"].split(",") %} -
    • {{ poll_answer }}
    • - {% endfor %} -
    -
    - {% endif %} - -
    {{ post.get("body_markdown") | social_mediafy(datasource='tumblr') | safe }}
    + + + {% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %} + {% for block in post.content_order.split(",") %} + + {% if block == "text" %} +

    {{ post.get("body_markdown").split("\n")[block_counts.text] | social_mediafy(datasource='tumblr') | safe }}

    + {% set block_counts.text = block_counts.text + 1 %} + + {% elif block == "image" %} +
    + +
    + {% set block_counts.image = block_counts.image + 1 %} + + {% elif block == "video" %} + + {% set block_counts.video = block_counts.video + 1 %} + + {% elif block == "audio" %} +
    + +
    +
    +
    + {% set block_counts.audio = block_counts.audio + 1 %} + + {% elif block == "link" %} + {% set url = post.linked_urls.split(",")[block_counts.link] %} + {% set url_title = post.linked_urls_titles.split(",")[block_counts.link] %} + + {% set block_counts.link = block_counts.link + 1 %} + + {% elif block == "poll" %} + +
    +
    {{ post["poll_question"] }}
    +
      + {% for poll_answer in post["poll_answers"].split(",") %} +
{{ poll_answer }}
    • + {% endfor %} +
    +
    + {% endif %} + {% endfor %} + {% if post.get("tags") %}
    @@ -88,7 +103,7 @@
    -
    Posted {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    +
    {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    {% if post.notes %} @@ -119,7 +134,7 @@
    {% if not pseudonymised %}{{ author_replied }}{% else %}{% endif %}
    -
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") }}
    +
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") | social_mediafy(datasource='tumblr') | safe }}
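{# `replies` holds "author: text" strings joined by blank lines; the author prefix is stripped here because the author is shown separately. #}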
    {% endfor %} From d5d14e03249ebcf0c7d33f8fef118ab489c3be9a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:50:10 +0200 Subject: [PATCH 095/204] Get block orders and start changing how note retrieval works in Tumblr search --- datasources/tumblr/search_tumblr.py | 227 ++++++++++++++++------------ 1 file changed, 129 insertions(+), 98 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 9b2c89314..ee23e2f30 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -49,7 +49,7 @@ class SearchTumblr(Search): seen_ids = set() client = None failed_notes = [] - failed_reblogs = [] + failed_posts = [] config = { # Tumblr API keys to use for data capturing @@ -219,31 +219,31 @@ def get_items(self, query): # for each tag or blog, get post for query in queries: - # Get posts per tag - if scope == "tag": - # Used for getting tagged posts, which uses requests instead. - api_key = self.parameters.get("consumer_key") - if not api_key: - api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] + # Get posts per tag + if scope == "tag": + # Used for getting tagged posts, which uses requests instead. + api_key = self.parameters.get("consumer_key") + if not api_key: + api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] - new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) + new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) - # Get posts per blog - elif scope == "blog": - new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) + # Get posts per blog + elif scope == "blog": + new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - else: - self.dataset.update_status("Invalid scope") - break + else: + self.dataset.update_status("Invalid scope") + break - results += new_results + results += new_results - if self.max_posts_reached: - self.dataset.update_status("Max posts exceeded") - break - if self.api_limit_reached: - self.dataset.update_status("API limit reached") - break + if self.max_posts_reached: + self.dataset.update_status("Max posts exceeded") + break + if self.api_limit_reached: + self.dataset.update_status("API limit reached") + break # Loop through the results once to add note data and fetch text reblogs, len_results = len(results) # results will change in length when we add reblogs. @@ -259,10 +259,10 @@ def get_items(self, query): # Retrieving notes for tag-based posts must be done one-by-one; # fetching them all at once is not supported by the Tumblr API. 
if not "notes" in post: - self.dataset.update_status("Getting note data for post %i/%i" % (i, len_results)) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) - # Prepare dicts to pass to `get_post_notes` - notes = self.get_post_notes(post["blog_name"], post["id"]) + notes = self.get_notes(post["blog_name"], post["id"]) + time.sleep(.2) if notes: results[i]["notes"] = notes @@ -273,6 +273,7 @@ def get_items(self, query): text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) if text_reblog: results.append(text_reblog) + time.sleep(.2) self.job.finish() return results @@ -334,7 +335,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): posts = response.json()["response"] except ConnectionError: - self.update_status("Encountered a connection error, waiting 10 seconds.") + self.update_status("Encountered a connection error, waiting 10 seconds") time.sleep(10) retries += 1 continue @@ -382,11 +383,9 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): if date_retries < 96: max_date -= 21600 # Decrease by six hours - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)" % (str(len(all_posts)), tag, max_date_str, str(date_retries),)) elif date_retries <= self.max_date_retries: max_date -= 604800 # Decrease by one week - retry_str = str(date_retries - 96) - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)" % (str(len(all_posts)), tag, max_date_str, str(retry_str),)) + self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. if min_date: @@ -481,7 +480,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): self.max_posts_reached = True break - self.dataset.update_status("Collected %s posts for tag %s, now looking for posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + self.dataset.update_status("Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + time.sleep(.2) return all_posts @@ -522,7 +522,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): posts = posts["posts"] except Exception as e: - self.dataset.update_status("Reached the limit of the Tumblr API. 
Last timestamp: %s" % str(max_date)) self.api_limit_reached = True break @@ -560,11 +559,54 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): self.max_posts_reached = True break - self.dataset.update_status("Collected %s posts" % str(len(all_posts))) + self.dataset.update_status("Collected %s posts for blog %s" % str(len(all_posts), blog)) + time.sleep(.2) return all_posts - def get_post_notes(self, blog_id, post_id): + def get_post_by_id(self, blog_name, post_id): + """ + Fetch individual posts + :param blog_name, str: The blog's name + :param id, int: The post ID + + returns result list, a list with a dictionary with the post's information + """ + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") + + connection_retries = 0 + + while True: + if connection_retries >= 5: + self.dataset.update_status("Too many connection errors; unable to collect post %s" % post_id) + break + try: + # Request the specific post. + post = self.client.posts(blog_name, id=post_id, npf=True, reblog_info=True, notes_info=True, filter="raw") + + except ConnectionRefusedError: + connection_retries += 1 + self.failed_posts.append(note["id"]) + self.dataset.update_status("ConnectionRefused: Unable to collect reblogs for post %s" % post_id) + time.sleep(10) + continue + + if post: + break + time.sleep(.2) + + # Tumblr API can sometimes return with this kind of error: + # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} + if not post or "posts" not in post: + return None + + # Get the first element of the list - it's always one post. + result = post["posts"][0] + + return result + + def get_notes(self, blog_id, post_id): """ Gets data on the notes of a specific post. :param blog_id, str: The ID of the blog. @@ -579,6 +621,9 @@ def get_post_notes(self, blog_id, post_id): # Do some counting count = 0 + # Some posts have tens of thousands of notes + # so we'll cap this at 100 + # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 @@ -590,9 +635,27 @@ def get_post_notes(self, blog_id, post_id): while True: + if notes_retries >= max_notes_retries: + self.dataset.update_status("Too many connection errors; unable to collect notes for post %s" % post_id) + self.failed_posts.append(post_id) + break + # Requests a post's notes - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) - + try: + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) + print(notes) + except ConnectionRefusedError: + self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) + notes_retries += 1 + time.sleep(10) + continue + + except Exception as e: + # Stop with unknown errors + self.dataset.update_status("Couldn't get notes for post %s. Unknown error: %s" % (post_id, e)) + notes_retries += 1 + break + if "notes" in notes: notes_retries = 0 @@ -600,7 +663,9 @@ def get_post_notes(self, blog_id, post_id): post_notes.append(note) if notes.get("_links"): + print("more notes for " + str(blog_id) + " " + str(post_id)) max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + time.sleep(.2) # If there's no `_links` key, that's all. 
else: @@ -608,55 +673,11 @@ def get_post_notes(self, blog_id, post_id): # If there's no "notes" key in the returned dict, something might be up else: - self.dataset.update_status("Couldn't get notes for Tumblr post " + str(post_id)) notes_retries += 1 - pass - - if notes_retries > max_notes_retries: - self.failed_notes.append(post_id) - break - - return post_notes - - def get_post_by_id(self, blog_name, post_id): - """ - Fetch individual posts - :param blog_name, str: The blog's name - :param id, int: The post ID - - returns result list, a list with a dictionary with the post's information - """ - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - - connection_retries = 0 - - while True: - if connection_retries >= 5: - self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") - break - try: - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True) - - except ConnectionRefusedError: - connection_retries += 1 - self.failed_reblogs.append(note["id"]) - self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post " + note["id"]) + time.sleep(1) continue - - if post: - break - # Tumblr API can sometimes return with this kind of error: - # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if not post or "posts" not in post: - return None - - # Get the first element of the list - it's always one post. - result = post["posts"][0] - - return result + return post_notes @staticmethod def get_tumblr_keys(user): @@ -762,8 +783,9 @@ def map_item(post): linked_titles = [] question = "" answers = "" - raw_text = "" + raw_text = [] formatted_text = [] + content_order = [] # To retain the order in which post blocks appear authors_liked = [] authors_reblogged = [] authors_replied = [] @@ -792,6 +814,8 @@ def map_item(post): video_urls.append(block["media"]["url"]) if "filmstrip" in block: video_thumb_urls.append(block["filmstrip"]["url"]) + elif "poster" in block: + video_thumb_urls.append(block["poster"][0]["url"]) elif block_type == "link": linked_urls.append(block["url"]) if "title" in block: @@ -800,8 +824,9 @@ def map_item(post): raw_text += block["description"] + "\n" formatted_text.append(block["description"]) elif block_type == "poll": - question += block["question"] - answers = [a["answer_text"] for a in block["answers"]] + # Only one poll can be added per post + question = block["question"] + answers = ",".join([a["answer_text"] for a in block["answers"]]) # We're gonna add some formatting to the text # Skip text that is part of a reblogged post. @@ -830,11 +855,16 @@ def map_item(post): if block.get("subtype") == "unordered-list-item": text = "- " + text - raw_text += block["text"] + "\n" + raw_text.append(block["text"]) formatted_text.append(text) elif block_type == "text" and i in reblogged_text_blocks: body_reblogged.append(block["text"]) + # Reblogged text is not considered as an ordered post block, + # as it is always put first. 
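+                # The `continue` below also keeps reblogged text out of
+                # `content_order`, which only tracks the post's own blocks.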
+ continue + + content_order.append(block_type) # Add note data for note in post.get("notes", []): @@ -865,11 +895,12 @@ def map_item(post): "post_url": post["post_url"], "post_slug": post["slug"], "is_reblog": True if post.get("original_type") == "note" else "", - "body": raw_text, + "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_reblogged": "\n".join(body_reblogged) if body_reblogged else "", + "body_reblogged": "\n".join(body_reblogged), + "content_order": ",".join(content_order), "author_reblogged": author_reblogged, - "tags": ",".join(post["tags"]) if post.get("tags") else "", + "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], "like_count": len(authors_liked), "authors_liked": ",".join(authors_liked), @@ -878,15 +909,15 @@ def map_item(post): "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), - "linked_urls": ",".join(linked_urls) if linked_urls else "", - "linked_urls_titles": "\n".join(linked_titles) if linked_titles else "", - "image_urls": ",".join(image_urls) if image_urls else "", - "video_urls": ",".join(video_urls) if video_urls else "", - "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", - "audio_urls": ",".join(audio_urls) if audio_urls else "", - "audio_artist": ",".join(audio_artists) if audio_artists else "", + "linked_urls": ",".join(linked_urls), + "linked_urls_titles": "\n".join(linked_titles), + "image_urls": ",".join(image_urls), + "video_urls": ",".join(video_urls), + "video_thumb_urls": ",".join(video_thumb_urls), + "audio_urls": ",".join(audio_urls), + "audio_artist": ",".join(audio_artists), "poll_question": question, - "poll_answers": ",".join(answers) + "poll_answers": answers }) def after_process(self): @@ -900,8 +931,8 @@ def after_process(self): errors = [] if len(self.failed_notes) > 0: errors.append("API error(s) when fetching notes %s" % ", ".join(self.failed_notes)) - if len(self.failed_reblogs) > 0: - errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_reblogs)) + if len(self.failed_posts) > 0: + errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_posts)) if errors: self.dataset.log(";\n ".join(errors)) - self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details.") + self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details") From 8e885f013b2bf3e22cc29d61a7ded726ed5aebca Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:09:44 +0200 Subject: [PATCH 096/204] Fix Markdown, include audio and video, and follow correct block order in Tumblr Template --- datasources/tumblr/search_tumblr.py | 106 +++++++++++++++++++--------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index ee23e2f30..536981a1f 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -7,6 +7,7 @@ import time import pytumblr import requests +import re from requests.exceptions import ConnectionError from datetime import datetime from ural import urls_from_text @@ -765,22 +766,20 @@ def map_item(post): Tumblr posts can be many different types, so some data processing is necessary. :param posts, list: List of Tumblr posts as returned form the Tumblr API. - :param reblog, bool: Whether the post concerns a reblog of posts from the original dataset. 
:return dict: Mapped item """ media_types = ["photo", "video", "audio"] - # We're getting info as Neue Post Format types, - # so we need to loop through some 'blocks'. image_urls = [] video_urls = [] video_thumb_urls = [] audio_urls = [] audio_artists = [] - linked_urls = [] - linked_titles = [] + link_urls = [] + link_titles = [] + link_descriptions = [] question = "" answers = "" raw_text = [] @@ -801,57 +800,88 @@ def map_item(post): reblogged_text_blocks += layout_block["blocks"] author_reblogged = layout_block["attribution"]["blog"]["name"] - # Loop through "blocks" + # We're getting info as Neue Post Format types, + # so we need to loop through and join some content 'blocks'. for i, block in enumerate(post.get("content", [])): + block_type = block["type"] + # Image if block_type == "image": image_urls.append(block["media"][0]["url"]) + # Audio file elif block_type == "audio": - audio_urls.append(block["media"]["url"]) - audio_artists.append(block["artist"]) + audio_urls.append(block["url"] if "url" in block else block["media"]["url"]) + audio_artists.append(block.get("artist", "")) + # Video (embedded or hosted) elif block_type == "video": - video_urls.append(block["media"]["url"]) + if "media" in block: + video_urls.append(block["media"]["url"]) + elif "url" in block: + video_urls.append(block["url"]) if "filmstrip" in block: video_thumb_urls.append(block["filmstrip"]["url"]) elif "poster" in block: - video_thumb_urls.append(block["poster"][0]["url"]) + video_thumb_urls.append(block["poster"][0]["url"]) + else: + video_thumb_urls.append("") + # Embedded link elif block_type == "link": - linked_urls.append(block["url"]) + link_urls.append(block["url"]) if "title" in block: - linked_titles.append(block["title"]) + link_titles.append(block["title"]) if "description" in block: - raw_text += block["description"] + "\n" - formatted_text.append(block["description"]) + link_descriptions.append(block["description"]) + # Poll elif block_type == "poll": # Only one poll can be added per post question = block["question"] answers = ",".join([a["answer_text"] for a in block["answers"]]) - # We're gonna add some formatting to the text - # Skip text that is part of a reblogged post. + # Text + # Here we're adding Markdown formatting. + # We skip text that is part of a reblogged post. elif block_type == "text" and i not in reblogged_text_blocks: text = block["text"] - extra_chars = 0 if block.get("formatting"): + + # Dict with index numbers as keys where inserts need to be made, + # and the replacement strings as values. Done this way so we know + # when multiple formatting operations need to be made at the same + # index position. 
+ insert_indexes = set() + inserts = {} + for fmt in block["formatting"]: - fmt_type = fmt["type"] - s = fmt["start"] + extra_chars # Start of formatted substring - e = fmt["end"] + extra_chars # End of formatted substring - - if fmt_type == "link": - text = text[:s] + "[" + text[s:e] + "](" + fmt["formatting"]["url"] + ")" + text[e:] - extra_chars += 4 + len(fmt["formatting"]["url"]) - elif fmt_type == "italic": - text = text[:s] + "*" + text[s:e] + "*" + text[e:] - extra_chars += 2 - elif fmt_type == "bold": - text = text[:s] + "**" + text[s:e] + "**" + text[e:] - extra_chars += 4 - + if fmt["type"] in ("link", "bold", "italic"): + s = fmt["start"] + e = fmt["end"] + + opening = True # So we know if the styles need to be appended or prepended + for i in [s, e]: + insert_indexes.add(i) + i = str(i) + if i not in inserts: + inserts[i] = "" + if fmt_type == "link" and opening: + inserts[i] = inserts[i] + "[" + elif fmt_type == "link" and not opening: + inserts[i] = "](" + fmt["url"] + ")" + inserts[i] + elif fmt_type == "italic": + inserts[i] = "*" + inserts[i] if opening else inserts[i] + "*" + elif fmt_type == "bold": + inserts[i] = "**" + inserts[i] if opening else inserts[i] + "**" + opening = False + if inserts: + extra_chars = 0 + for i, insert in inserts.items(): + i = int(i) + extra_chars + text = text[:i] + insert + text[i:] + extra_chars += len(insert) + if block.get("subtype") == "unordered-list-item": text = "- " + text @@ -866,6 +896,15 @@ def map_item(post): content_order.append(block_type) + # Sometimes the order is reshuffled in the `layout` property... + if post.get("layout"): + if "type" in post["layout"][0]: + if post["layout"][0]["type"] == "rows": + new_content_order = [] + for i in post["layout"][0].get("display", []): + new_content_order.append(content_order[i["blocks"][0]]) + content_order = new_content_order + # Add note data for note in post.get("notes", []): if note["type"] == "like": @@ -909,8 +948,9 @@ def map_item(post): "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), - "linked_urls": ",".join(linked_urls), - "linked_urls_titles": "\n".join(linked_titles), + "link_urls": ",".join(link_urls), + "link_titles": "\n".join(link_titles), + "link_descriptions": "\n".join(link_descriptions), "image_urls": ",".join(image_urls), "video_urls": ",".join(video_urls), "video_thumb_urls": ",".join(video_thumb_urls), From c7fa5fa1a7cece8a778aa2661be0c26efea22e0d Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:07 +0200 Subject: [PATCH 097/204] Skip URLs in social mediafy template filter if it's already markdown --- webtool/lib/template_filters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 3bc75ebb9..f04546d1a 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -202,13 +202,15 @@ def _jinja2_filter_social_mediafy(body, datasource=""): }, "tumblr": { "hashtag": "https://tumblr.com/tagged/", - "mention": "https://tumblr.com/" + "mention": "https://tumblr.com/", + "markdown": True }, "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", "mention": "https://linkedin.com/in/" }, "telegram": { + "markdown": True } } @@ -218,12 +220,12 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body # Add URL links - if datasource != "telegram": # Telegram has mardown links + if not base_urls[datasource].get("markdown"): for url in 
urls_from_text(body): body = re.sub(url, "%s" % (url, url), body) # Add hashtag links - if "hasthag" in base_urls[datasource]: + if "hashtag" in base_urls[datasource]: tags = re.findall(r"#[\w0-9]+", body) # We're sorting tags by length so we don't incorrectly # replace tags that are a substring of another, longer tag. From 281bf568e06d27a0898c25bdee383b9b241bd6f1 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:20 +0200 Subject: [PATCH 098/204] add markdown --- webtool/static/css/explorer/tumblr.css | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index cc33bc29d..e1299086b 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -72,6 +72,14 @@ header .author { filter: blur(1.5rem); } +.media-container.audio { + margin: initial; +} + +.media-container.audio audio { + width: 100%; +} + .play-button { position: absolute; width: 100%; @@ -130,6 +138,11 @@ header .author { color: white; } +.embedded-link .link-description { + margin-top: 3px; + font-size: 14px; +} + .poll-question { font-size: 20px; padding: 3px 0px 3px 0px; From b65ad4393eb49ee9536252145b6dd9e11070027c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:28 +0200 Subject: [PATCH 099/204] Typo in pagination --- webtool/templates/explorer/pagination.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html index 2161f22bd..1dbb4f05e 100644 --- a/webtool/templates/explorer/pagination.html +++ b/webtool/templates/explorer/pagination.html @@ -50,7 +50,7 @@ {# Show upper 'edge' pages #} {% elif upper_bound - 2 <= page <= upper_bound %} {% for i in range(page - 1, upper_bound) %} -
{% if page == current_page %}{{ page }}{% else %}">{{ i - 1 }}{% endif %}
+
{% if page == current_page %}{{ page }}{% else %}{{ i - 1 }}{% endif %}
  • {% endfor %} {% endif %} From e1d25da3f9ed9bb9f3314381c5a505eb8edc9292 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:47 +0200 Subject: [PATCH 100/204] Add video to Tumblr Template --- .../explorer/datasource-templates/tumblr.html | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 09f9d06a2..bfec84253 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -41,18 +41,18 @@ {% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %} {% for block in post.content_order.split(",") %} - {% if block == "text" %} -

    {{ post.get("body_markdown").split("\n")[block_counts.text] | social_mediafy(datasource='tumblr') | safe }}

    + +

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

    {% set block_counts.text = block_counts.text + 1 %} - {% elif block == "image" %} +
    {% set block_counts.image = block_counts.image + 1 %} - {% elif block == "video" %} + {% set block_counts.video = block_counts.video + 1 %} - {% elif block == "audio" %} +
    - -
    -
    +
    {% set block_counts.audio = block_counts.audio + 1 %} - {% elif block == "link" %} - {% set url = post.linked_urls.split(",")[block_counts.link] %} - {% set url_title = post.linked_urls_titles.split(",")[block_counts.link] %} - + + {% set url = post.link_urls.split(",")[block_counts.link] %} + {% set link_title = post.link_titles.split(",")[block_counts.link] %} + {% set link_description = post.link_descriptions.split(",")[block_counts.link] %} + {% set block_counts.link = block_counts.link + 1 %} - {% elif block == "poll" %} +
    {{ post["poll_question"] }}
    @@ -142,5 +144,4 @@ {% endif %}
    {% endif %} - \ No newline at end of file From 70e0c9bcecaec67b5268808690ab7502f85ec2c2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:11:10 +0200 Subject: [PATCH 101/204] Be more honest with errors --- webtool/views/views_explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 61fb467e6..18d657387 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -118,7 +118,7 @@ def explorer_dataset(key, page=1): break if not posts: - return error(404, error="No posts available for this datasource") + return error(404, error="No posts or posts could not be displayed") # We can use either a generic or a pre-made data source-specific template. template = "datasource" if has_datasource_template(datasource) else "generic" From 95d03f49c05d2f865128aa98b7ec57f2c14f5fb9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 23 Jul 2024 17:42:29 +0200 Subject: [PATCH 102/204] Add more layout options for Tumblr --- datasources/tumblr/search_tumblr.py | 30 +++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 536981a1f..3ab0cd171 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -2,6 +2,10 @@ Search Tumblr via its API Can fetch posts from specific blogs or with specific hashtags + +For Tumblr API documentation, see https://www.tumblr.com/docs/en/api/v2 +For Neue Post Format documentation, see https://github.com/tumblr/docs/blob/master/npf-spec.md + """ import time @@ -790,6 +794,9 @@ def map_item(post): authors_replied = [] replies = [] + # Keep track of list order + list_order = 1 + # Keep track if blocks belong to another post, # which is stored in `layout`. body_reblogged = [] @@ -881,9 +888,23 @@ def map_item(post): i = int(i) + extra_chars text = text[:i] + insert + text[i:] extra_chars += len(insert) - - if block.get("subtype") == "unordered-list-item": - text = "- " + text + + # Some more 'subtype' formatting + subtype = block.get("subtype") + if subtype: + if subtype == "unordered-list-item": + text = "- " + text + if subtype == "ordered-list-item": + text = list_order + ". " + text + list_order += 1 + elif subtype == "heading1": + text = "#" + + elif subtype == "heading2": + text = "##" + text + elif subtype == "quote": + text = ">" + text + elif subtype == "indented": + text = " " + text raw_text.append(block["text"]) formatted_text.append(text) @@ -896,7 +917,8 @@ def map_item(post): content_order.append(block_type) - # Sometimes the order is reshuffled in the `layout` property... + # Sometimes the order is reshuffled in the `layout` property, + # so we have to correct this. 
if post.get("layout"): if "type" in post["layout"][0]: if post["layout"][0]["type"] == "rows": From 451f2bb5c1497c3d1f18c53778bc0a5373b7dada Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 24 Jul 2024 12:49:20 +0200 Subject: [PATCH 103/204] No post reshuffling after the fact --- datasources/tumblr/search_tumblr.py | 42 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 3ab0cd171..2933279df 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -775,7 +775,6 @@ def map_item(post): """ media_types = ["photo", "video", "audio"] - image_urls = [] video_urls = [] video_thumb_urls = [] @@ -788,15 +787,11 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - content_order = [] # To retain the order in which post blocks appear authors_liked = [] authors_reblogged = [] authors_replied = [] replies = [] - # Keep track of list order - list_order = 1 - # Keep track if blocks belong to another post, # which is stored in `layout`. body_reblogged = [] @@ -807,10 +802,25 @@ def map_item(post): reblogged_text_blocks += layout_block["blocks"] author_reblogged = layout_block["attribution"]["blog"]["name"] + ordered_list_count = 1 + + # Sometimes the content order is reshuffled in the `layout` property, + # so we have to follow this. + content_order = [] + blocks = [] + if post.get("layout"): + if "type" in post["layout"][0]: + if post["layout"][0]["type"] == "rows": + for display in post["layout"][0].get("display", []): + content_order.append(display["blocks"][0]) + if not content_order: + content_order = range(len(post["content"])) + # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. - for i, block in enumerate(post.get("content", [])): + for i in content_order: + block = post["content"][i] block_type = block["type"] # Image @@ -895,10 +905,10 @@ def map_item(post): if subtype == "unordered-list-item": text = "- " + text if subtype == "ordered-list-item": - text = list_order + ". " + text - list_order += 1 + text = str(ordered_list_count) + ". " + text + ordered_list_count += 1 elif subtype == "heading1": - text = "#" + + text = "#" + text elif subtype == "heading2": text = "##" + text elif subtype == "quote": @@ -915,17 +925,7 @@ def map_item(post): # as it is always put first. continue - content_order.append(block_type) - - # Sometimes the order is reshuffled in the `layout` property, - # so we have to correct this. 
- if post.get("layout"): - if "type" in post["layout"][0]: - if post["layout"][0]["type"] == "rows": - new_content_order = [] - for i in post["layout"][0].get("display", []): - new_content_order.append(content_order[i["blocks"][0]]) - content_order = new_content_order + blocks.append(block_type) # Add note data for note in post.get("notes", []): @@ -959,7 +959,7 @@ def map_item(post): "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n".join(body_reblogged), - "content_order": ",".join(content_order), + "content_order": ",".join(blocks), "author_reblogged": author_reblogged, "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], From 996512d15c5d195c75999a748408b31b0eff5ec5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 24 Jul 2024 14:06:26 +0200 Subject: [PATCH 104/204] Skip duplicate posts in a better way --- datasources/tumblr/search_tumblr.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 2933279df..7604f7d46 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -345,22 +345,23 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): retries += 1 continue - # Get rid of posts that we already enountered, + # Skip posts that we already enountered, # preventing Tumblr API shenanigans or double posts because of - # time reductions. Make sure it's no odd error string, though. - unseen_posts = [] - for check_post in posts: + # time reductions. Make sure it's no error string, though. + new_posts = [] + for post in posts: # Sometimes the API repsonds just with "meta", "response", or "errors". - if isinstance(check_post, str): - self.dataset.update_status("Couldn't add post:", check_post) + if isinstance(post, str): + self.dataset.update_status("Couldn't add post:", post) retries += 1 break else: retries = 0 - if check_post["id"] not in self.seen_ids: - unseen_posts.append(check_post) + if post["id"] not in self.seen_ids: + self.seen_ids.add(post["id"]) + new_posts.append(post) - posts = unseen_posts + posts = new_posts # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested. # So we have to prevent this manually. @@ -431,8 +432,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S") self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,)) - - self.seen_ids.update([post["id"] for post in posts]) posts = [post for post in posts if post["timestamp"] >= date] if posts: all_posts += posts @@ -456,7 +455,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): if posts: all_posts += posts - self.seen_ids.update([post["id"] for post in posts]) break # We got a new post, so we can reset the retry counts. 
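The pattern these hunks converge on is worth spelling out: record a post's ID the moment it is kept, so that later pages of the paginated API can never re-add it. A minimal sketch of that idea, outside the patch series, with a hypothetical `pages` iterable standing in for the successive API responses:

    def deduplicate(pages):
        """Keep only the first occurrence of each post across API pages."""
        seen_ids = set()
        all_posts = []
        for posts in pages:
            for post in posts:
                # The API sometimes returns bare strings ("meta", "errors")
                # instead of post dicts; the real code retries, this sketch skips.
                if isinstance(post, str):
                    continue
                if post["id"] not in seen_ids:
                    seen_ids.add(post["id"])   # record immediately, so a later
                    all_posts.append(post)     # page cannot re-add the same post
        return all_posts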
@@ -466,9 +464,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 				# Add retrieved posts to the main list
 				all_posts += posts

-				# Add to seen ids
-				self.seen_ids.update([post["id"] for post in posts])
-
 				# Add time differences and calculate new average time difference
 				all_time_difs += time_difs

From 05e5c7bb78b87e140cd3133e7de63efcad972a20 Mon Sep 17 00:00:00 2001
From: Sal Hagen <salhagen@protonmail.com>
Date: Wed, 24 Jul 2024 14:06:40 +0200
Subject: [PATCH 105/204] Don't hashtagify

---
 webtool/lib/template_filters.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index f04546d1a..f7bfc23a2 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -201,16 +201,15 @@ def _jinja2_filter_social_mediafy(body, datasource=""):
 			"mention": "https://instagram.com/"
 		},
 		"tumblr": {
-			"hashtag": "https://tumblr.com/tagged/",
 			"mention": "https://tumblr.com/",
-			"markdown": True
+			"markdown": True # Hashtags aren't linked in the post body
 		},
 		"linkedin": {
 			"hashtag": "https://linkedin.com/feed/hashtag/?keywords=",
 			"mention": "https://linkedin.com/in/"
 		},
 		"telegram": {
-			"markdown": True
+			"markdown": True
 		}
 	}
@@ -232,7 +231,6 @@ def _jinja2_filter_social_mediafy(body, datasource=""):
 		tags = sorted(tags, key=lambda x: len(x), reverse=True)
 		for tag in tags:
 			# Match the string, but not if it's preceded by a >, which indicates that we've already added an anchor tag.
-			# This avoids problems with repeated substrings (e.g. #Dog and #DogOwners).
 			body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body)

 		# Add @-mention links

From 8263ebc31014fd66c2923b9960c8c184a045aaa3 Mon Sep 17 00:00:00 2001
From: Sal Hagen <salhagen@protonmail.com>
Date: Wed, 24 Jul 2024 16:40:05 +0200
Subject: [PATCH 106/204] Skip duplicate Tumblr posts and format Ask content better

---
 datasources/tumblr/search_tumblr.py           | 127 ++++++++++--------
 .../explorer/datasource-templates/tumblr.html |  45 ++++---
 2 files changed, 99 insertions(+), 73 deletions(-)

diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py
index 7604f7d46..bae3b8878 100644
--- a/datasources/tumblr/search_tumblr.py
+++ b/datasources/tumblr/search_tumblr.py
@@ -533,27 +533,43 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None):
 					self.dataset.update_status("No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
 					continue

-				# Append posts to main list
-				else:
+				# Skip posts that we already encountered,
+				# preventing Tumblr API shenanigans or double posts because of
+				# time reductions. Make sure it's no error string, though.
+				new_posts = []
+				for post in posts:
+					# Sometimes the API responds just with "meta", "response", or "errors".
+					if isinstance(post, str):
+						self.dataset.update_status("Couldn't add post:", post)
+						retries += 1
+						break
+					else:
+						retries = 0
+						if post["id"] not in self.seen_ids:
+							self.seen_ids.add(post["id"])
+							new_posts.append(post)

-					# Get the lowest date
-					max_date = sorted([post["timestamp"] for post in posts])[0]
+				posts = new_posts

-					# Manually check if we have a lower date than the min date (`min_date`) already.
-					# This functonality is not natively supported by Tumblr.
-					if min_date:
-						if max_date < min_date:
+				# Append posts to main list
+				# Get the lowest date
+				max_date = sorted([post["timestamp"] for post in posts])[0]

-							# Get rid of all the posts that are earlier than the max_date timestamp
-							posts = [post for post in posts if post["timestamp"] >= min_date]
+				# Manually check if we have a lower date than the min date (`min_date`) already.
+				# This functionality is not natively supported by Tumblr.
+				if min_date:
+					if max_date < min_date:

-							if posts:
-								all_posts += posts
-							break
+						# Get rid of all the posts that are earlier than the max_date timestamp
+						posts = [post for post in posts if post["timestamp"] >= min_date]

-					retries = 0
+						if posts:
+							all_posts += posts
+						break

-					all_posts += posts
+				retries = 0
+
+				all_posts += posts

 				if len(all_posts) >= self.max_posts:
 					self.max_posts_reached = True
 					break

@@ -782,20 +798,12 @@ def map_item(post):
 		answers = ""
 		raw_text = []
 		formatted_text = []
+		body_asked = []
+		author_asked = ""
 		authors_liked = []
-		authors_reblogged = []
 		authors_replied = []
 		replies = []
-
-		# Keep track if blocks belong to another post,
-		# which is stored in `layout`.
-		body_reblogged = []
-		reblogged_text_blocks = []
-		author_reblogged = ""
-		for layout_block in post.get("layout", []):
-			if layout_block["type"] == "ask":
-				reblogged_text_blocks += layout_block["blocks"]
-				author_reblogged = layout_block["attribution"]["blog"]["name"]
+		unknown_blocks = []

 		ordered_list_count = 1

@@ -811,6 +819,13 @@ def map_item(post):
 		if not content_order:
 			content_order = range(len(post["content"]))

+		# Some text blocks are 'ask' blocks
+		ask_blocks = []
+		for layout_block in post.get("layout", []):
+			if layout_block["type"] == "ask":
+				ask_blocks += layout_block["blocks"]
+				author_asked = layout_block["attribution"]["blog"]["name"]
+
 		# We're getting info as Neue Post Format types,
 		# so we need to loop through and join some content 'blocks'.
 		for i in content_order:
@@ -850,10 +865,8 @@ def map_item(post):
 				question = block["question"]
 				answers = ",".join([a["answer_text"] for a in block["answers"]])

-			# Text
-			# Here we're adding Markdown formatting.
-			# We skip text that is part of a reblogged post.
-			elif block_type == "text" and i not in reblogged_text_blocks:
+			# Text; we're adding Markdown formatting.
+ elif block_type == "text": text = block["text"] @@ -872,26 +885,26 @@ def map_item(post): s = fmt["start"] e = fmt["end"] - opening = True # So we know if the styles need to be appended or prepended - for i in [s, e]: - insert_indexes.add(i) - i = str(i) - if i not in inserts: - inserts[i] = "" + opening = True # To know if styles need to be appended or prepended + for n in [s, e]: + insert_indexes.add(n) + n = str(n) + if n not in inserts: + inserts[n] = "" if fmt_type == "link" and opening: - inserts[i] = inserts[i] + "[" + inserts[n] = inserts[n] + "[" elif fmt_type == "link" and not opening: - inserts[i] = "](" + fmt["url"] + ")" + inserts[i] + inserts[n] = "](" + fmt["url"] + ")" + inserts[n] elif fmt_type == "italic": - inserts[i] = "*" + inserts[i] if opening else inserts[i] + "*" + inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" elif fmt_type == "bold": - inserts[i] = "**" + inserts[i] if opening else inserts[i] + "**" + inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" opening = False if inserts: extra_chars = 0 - for i, insert in inserts.items(): - i = int(i) + extra_chars - text = text[:i] + insert + text[i:] + for n, insert in inserts.items(): + n = int(n) + extra_chars + text = text[:n] + insert + text[n:] extra_chars += len(insert) # Some more 'subtype' formatting @@ -911,14 +924,18 @@ def map_item(post): elif subtype == "indented": text = " " + text - raw_text.append(block["text"]) - formatted_text.append(text) + # If it's an ask text, we're storing it in + # a different column + if i in ask_blocks: + block_type = "ask" + body_asked.append(block["text"]) + else: + raw_text.append(block["text"]) + formatted_text.append(text) - elif block_type == "text" and i in reblogged_text_blocks: - body_reblogged.append(block["text"]) - # Reblogged text is not considered as an ordered post block, - # as it is always put first. - continue + # Unknown block; can be a third-party app + else: + unknown_blocks.append(json.dumps(block)) blocks.append(block_type) @@ -953,15 +970,14 @@ def map_item(post): "is_reblog": True if post.get("original_type") == "note" else "", "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_reblogged": "\n".join(body_reblogged), + "body_asked": "\n".join(body_asked), + "author_asked": author_asked, "content_order": ",".join(blocks), - "author_reblogged": author_reblogged, "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], "like_count": len(authors_liked), "authors_liked": ",".join(authors_liked), - "reblog_count": len(authors_reblogged), - "authors_reblogged": ",".join(authors_reblogged), + #"reblog_count": len(authors_reblogged), "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), @@ -974,7 +990,8 @@ def map_item(post): "audio_urls": ",".join(audio_urls), "audio_artist": ",".join(audio_artists), "poll_question": question, - "poll_answers": answers + "poll_answers": answers, + "unknown_blocks": "\n".join(unknown_blocks) }) def after_process(self): diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index bfec84253..67ff9b161 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -22,25 +22,12 @@
    - - {% if post.get("body_reblogged") %} -
    -
    -
    {% if not pseudonymised %}{{ post["author_reblogged"] }}{% else %}{% endif %}
    -
    {{ post["body_reblogged"] }}
    -
    -
    -
    - {% if not pseudonymised %} - - {% endif %} -
-	{% endif %}
-	{% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %}
-	{% for block in post.content_order.split(",") %}
+	{% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0, 'ask': 0}) %}
+	{% set content_order = post.content_order.split(",") %}
+	{% for block in content_order %}
 	{% if block == "text" %}

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

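The per-type counters in `block_counts` are what keep this lookup aligned: `content_order` interleaves all block types, while `body_markdown` only holds the text blocks, one per line, so the n-th "text" entry in the order maps to line n of `body_markdown`. The same indexing in plain Python, with made-up values:

    content_order = "text,image,text".split(",")
    body_markdown = "first paragraph\nsecond paragraph".split("\n")

    block_counts = {"text": 0, "image": 0}
    for block in content_order:
        if block == "text":
            # The per-type counter, not the loop index, selects the right line
            print(body_markdown[block_counts["text"]])
        block_counts[block] += 1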
    @@ -71,10 +58,12 @@ {% set url = post.link_urls.split(",")[block_counts.link] %} {% set link_title = post.link_titles.split(",")[block_counts.link] %} {% set link_description = post.link_descriptions.split(",")[block_counts.link] %} - + + {% elif block == "ask" %} + {% set start_ask_block = True if loop.index == 0 or content_order[loop.index - 2] != "ask" else False %} + {% set end_ask_block = True if loop.index == content_order|length or content_order[loop.index] != "ask" else False %} + {% if start_ask_block %} +
    +
    +
    {% if not pseudonymised %}{{ post["author_asked"] }}{% else %}{% endif %} asked:
    + {% endif %} +

    {{ post.get("body_asked").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    + {% if end_ask_block %} +
    +
    +
    + {% if not pseudonymised %} + + {% endif %} +
    + {% endif %} + {% set block_counts.ask = block_counts.ask + 1 %} {% endif %} {% endfor %} From df7185a6ef85dbaf1b4c95d1d8db9d5824a168eb Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 26 Jul 2024 11:48:45 +0200 Subject: [PATCH 107/204] More Tumblr Explorer templating --- webtool/static/css/explorer/tumblr.css | 51 ++++++++++++++----- .../explorer/datasource-templates/tumblr.html | 20 ++++++-- 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index e1299086b..baeab6eac 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -68,7 +68,7 @@ header .author { .media-container.video img { min-height: 300px; - width: auto; + width: 100%; filter: blur(1.5rem); } @@ -108,37 +108,62 @@ header .author { padding: 3px 0px 3px 0px; } -.reblogged-content { + +.post-content h1 { + font-size: 20px; + font-weight: bold; + background: none; + text-align: left; + color: black; +} + +.post-content h2 { + font-size: 16px; + font-weight: bold; +} + +.ask-content { margin-bottom: 19px; display: inline-block; - max-width: 400px; + max-width: 450px; +} + +.ask-content .body-ask { padding: 25px; background-color: #ededed; } -.author-reblogged { +.ask-content p { + margin: 5px 0px 5px 0px; +} + +.ask-content { +} + +.author-ask { padding-bottom: 3px; } -.author-reblogged-avatar { +.author-ask-avatar { display: inline-block; + vertical-align: top; +} + +a.embedded-link:hover { + text-decoration: none; } -.embedded-link { +.embedded-link-box { padding: 30px; background-color: #001935; color: white; text-align: center; font-size: 18px; - border-radius: 15px; - margin-bottom: 19px; -} - -.embedded-link a { - color: white; + border-radius: 5px; + margin: 19px 0px 19px 0px; } -.embedded-link .link-description { +.embedded-link-box .link-description { margin-top: 3px; font-size: 14px; } diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 67ff9b161..3866f3c70 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -21,6 +21,20 @@ + +{% if post.author_trail %} + +{% for reblog_author in post.author_trail.split(",") %} +
    + {{ reblog_author }} +

    + {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} +

    +
+{% endfor %}
+
+{% endif %}
+
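This loop only works because the mapping code keeps the two columns parallel: `author_trail` is comma-separated, `body_reblogged` separates entries with blank lines, and `loop.index - 1` pairs them positionally. The same pairing in plain Python, with made-up values:

    author_trail = "blog-a,blog-b"
    body_reblogged = "text added by blog-a\n\ntext added by blog-b"

    for author, body in zip(author_trail.split(","), body_reblogged.split("\n\n")):
        print("%s: %s" % (author, body))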
    @@ -83,15 +97,15 @@ {% if start_ask_block %}
    -
    {% if not pseudonymised %}{{ post["author_asked"] }}{% else %}{% endif %} asked:
    +
    {% if not pseudonymised %}{{ post["author_ask"] }}{% else %}{% endif %} asked:
    {% endif %} -

    {{ post.get("body_asked").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    +

    {{ post.get("body_ask").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    {% if end_ask_block %}
    {% if not pseudonymised %} - + {% endif %}
    {% endif %} From f6858badf28e3a661e589c3e229c594085efe03a Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 26 Jul 2024 11:48:56 +0200 Subject: [PATCH 108/204] Revamp Tumblr search v0.5 --- datasources/tumblr/search_tumblr.py | 541 ++++++++++++++++++---------- 1 file changed, 346 insertions(+), 195 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index bae3b8878..f2ad00dc5 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -94,32 +94,75 @@ def get_options(cls, parent_dataset=None, user=None): "intro": { "type": UserInput.OPTION_INFO, "help": "Retrieve any kind of Tumblr posts with specific tags or from specific blogs. Gets 100.000 posts " - "at max. Insert tags or names of blogs, one on each line. You may insert up to ten tags or " - "blogs.\n\nTumblr tags may include whitespace and commas. A `#` before the tag is optional.\n\n" - "Tag search only get posts explicitly associated with the exact tag you insert here. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is not " - "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf this 4CAT reached its Tumblr API rate " - "limit, try again 24 hours later." - }, - "search_scope": { - "type": UserInput.OPTION_CHOICE, - "help": "Search by", - "options": { - "tag": "Tag", - "blog": "Blog" - }, - "default": "tag" + "at max. You may insert up to ten tags or blogs.\n\n" + "Blog-level search also returns reblogs. *Tag-level search only returns original posts*. " + "Reblogs of tagged posts can be retrieved via the options below.\n\n" + "Tag search only get posts with the exact tag you insert. Querying " + "`gogh` will thus not get posts only tagged with `van gogh`.\n\n" + "A `#` before a tag is optional. Blog names must start with `@`.\n\n" + "Individual posts can be retrieved through the format `@blogname:post_id`.\n\n" + "Keyword search is not allowed by the [Tumblr API](https://api.tumblr.com).\n\n" + "If this 4CAT reached its Tumblr API rate limit, try again 24 hours later." }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags/blogs", - "tooltip": "Separate with commas or new lines." + "help": "Tags and/or blogs", + "tooltip": "E.g. #research tools, #digitalmethods, @the4catblog, @the4catblog:12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Get post notes (warning: slow)", - "tooltip": "Also retrieve post notes. Likes and replies are added to the original post. Text reblogs are added as new posts.", + "help": "Add note and reblog data (warning: slow)", + "tooltip": "Add post note data for every post. This includes note metrics (likes, replies, reblogs), " + "replies, and reblogged text. " + "Blog-level search includes reblogged text by default." + "This also allows adding reblogs as new posts", "default": False + }, + "get_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add reblogs of collected posts", + "tooltip": "Also include posts that reblogged the posts captured in the initial query. " + "Limited to 1,000 reblogs per post.", + "default": False, + "requires": "get_notes" + }, + "reblog_crawl_depth": { + "type": UserInput.OPTION_TEXT, + "help": "Reblog crawl depth", + "tooltip": "How many levels of reblogs to follow; e.g. 
a value of 2 adds every reblog " + "of the initial post, but also reblogs of these reblogs.", + "default": "1", + "requires": "get_reblogs", + "requires": "get_notes" + }, + "follow_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add posts reblogged by collected posts", + "tooltip": "Also include posts that were reblogged by the posts captured in the initial query. " + "This adds the entire reblog 'trail' from the initial post to the collected post. " + "Only affects blog-level search; tag search only gets original posts.", + "default": False, + "requires": "get_notes" + }, + "reblog_type": { + "type": UserInput.OPTION_CHOICE, + "help": "Reblogs to include", + "options": { + "all": "All (also 'empty' reblogs)", + "hashtag_or_text": "Only with added hashtags and/or added text", + "hashtag": "Only with added hashtags", + "text": "Only with added text" + }, + "tooltip": "What type of reblogs to add to the dataset.", + "default": "hashtag_or_text", + "requires": "get_notes" + }, + "reblog_outside_daterange": { + "type": UserInput.OPTION_TOGGLE, + "help": "Allow reblogs and reblogged posts outside of date range", + "default": False, + "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the date range limits inserted below.", + "requires": "get_notes" } } @@ -129,10 +172,10 @@ def get_options(cls, parent_dataset=None, user=None): # No 4CAT set keys for user; let user input their own options["key-info"] = { "type": UserInput.OPTION_INFO, - "help": "In order to access the Tumblr API, you need to register an application. You can do so " - "[here](https://www.tumblr.com/oauth/apps) and use the keys below. You will first get the OAuth " + "help": "To access the Tumblr API, you need to register an application. You can do so " + "[here](https://www.tumblr.com/oauth/apps). You will first get the OAuth " "Consumer Key and Secret, and then the User Token Key and Secret [after entering them here](ht" - "tps://api.tumblr.com/console/calls/user/info) and granting access." + "tps://api.tumblr.com/console/calls/user/info) and granting access." } options["consumer_key"] = { "type": UserInput.OPTION_TEXT, @@ -187,15 +230,20 @@ def get_items(self, query): # ready our parameters parameters = self.dataset.get_parameters() - scope = parameters.get("search_scope", "") - queries = parameters.get("query").split(", ") + queries = re.split(",|\n", parameters.get("query", "")) get_notes = parameters.get("get_notes", False) + get_reblogs = parameters.get("get_reblogs", False) + reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) + follow_reblogs = parameters.get("follow_reblogs", False) + reblog_type = parameters.get("reblog_type", "hashtag_or_text") + reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) # Store all info here results = [] - # Store all notes from posts by blogs here - all_notes = [] + # Blog names and post IDs of extra posts we need to fetch + # (e.g. 
in the reblog trail or posts that reblog captured posts) + extra_posts = set() # Get date parameters min_date = parameters.get("min_date", None) @@ -224,8 +272,23 @@ def get_items(self, query): # for each tag or blog, get post for query in queries: - # Get posts per tag - if scope == "tag": + query = query.strip() + + if query.startswith("@"): + blog_name = query[1:] + + # Get a possible post ID + post_id = None + if ":" in query: + blog_name, post_id = blog_name.split(":") + + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Get tagged post + else: + if query.startswith("#"): + query = query[1:] + # Used for getting tagged posts, which uses requests instead. api_key = self.parameters.get("consumer_key") if not api_key: @@ -233,14 +296,6 @@ def get_items(self, query): new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) - # Get posts per blog - elif scope == "blog": - new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - - else: - self.dataset.update_status("Invalid scope") - break - results += new_results if self.max_posts_reached: @@ -250,36 +305,88 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # Loop through the results once to add note data and fetch text reblogs, - len_results = len(results) # results will change in length when we add reblogs. - for i in range(len_results): + # Get note data. + # Also potentially store reblogs that we want to add to the dataset + if get_notes: + + # Dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain, + # so we can use this to check whether we already have the data. + retrieved_notes = {} + + for i, post in enumerate(results): - post = results[i] + if self.max_posts_reached: + break + if self.api_limit_reached: + break - # Get note information - if get_notes and not self.max_posts_reached and not self.api_limit_reached: + self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) - # Reblog information is already returned for blog-level searches - # and is stored as `notes` in the posts themselves. - # Retrieving notes for tag-based posts must be done one-by-one; - # fetching them all at once is not supported by the Tumblr API. - if not "notes" in post: - self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) + # We may have already encountered this note-chain + # with a different post. + if post["reblog_key"] in retrieved_notes: + notes = retrieved_notes[post["reblog_key"]] + # In the case of posts with just a few notes in blog-level search, + # we may have all the possible notes in the retrieved JSON. + elif len(post["notes"]) == post["note_count"]: + notes = post["notes"] + + # Do some conversion that is also done in get_notes + for note in notes: + + + else: + # We're getting notes in the "conversation" mode to + # prioritise replies and text reblogs. + # Only gets first 1,000 replies/text reblogs. 
notes = self.get_notes(post["blog_name"], post["id"]) - time.sleep(.2) + time.sleep(.1) + + final_notes = {"notes": notes, + "like_count": notes["like_count"], + "reply_count": notes["reply_count"], + "reblog_count": notes["reblog_count"], + } + + # Add to results + results[i] |= final_notes + retrieved_notes[post["reblog_key"]] = final_notes + + # Get the full data for text reblogs and add them as new posts + if get_reblogs: - if notes: - results[i]["notes"] = notes + for note in final_notes["notes"]: + + if reblog_type == "hashtag_or_text": + + elif reblog_type == "hashtag_or_text": + + elif reblog_type == "text": + + elif reblog_type == "all": + pass + + # Potentially skip new posts outside of the date range + if reblog_outside_daterange and (max_date or min_date): + if not min_date: + if note["timestamp"] >= max_date: + continue + elif not min_date <= note["timestamp"] <= max_date: + continue + + extra_posts.add({"blog": note["blog_name"], "post_id": note["post_id"]}) + + # Check for reblogged posts in the reblog trail + if follow_reblogs: + for result in results: + if result["trail"] + + # Add reblogged posts and reblogs to dataset + for extra_post in extra_posts: + print("add") - # Get the full data for text reblogs and add them as new posts - for note in notes: - if note["type"] == "reblog": - text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) - if text_reblog: - results.append(text_reblog) - time.sleep(.2) - self.job.finish() return results @@ -485,17 +592,25 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): return all_posts - def get_posts_by_blog(self, blog, max_date=None, min_date=None): + def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): """ - Get Tumblr posts posts with a certain blog - :param tag, str: the name of the blog you want to look for - :param min_date: a unix timestamp, indicates posts should be min_date this date. + Get Tumblr posts from a certain blog + :param blog, str: the name of the blog you want to look for + :param post_id, str: the post ID (optional) :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param min_date: a unix timestamp, indicates posts should be min_date this date. :returns: a dict created from the JSON response """ + blog = blog + ".tumblr.com" + if post_id: + try: + test_id = int(post_id) + except TypeError: + raise QueryParametersException("Post ID %s is invalid" % post_id) + if not max_date: max_date = int(time.time()) @@ -518,9 +633,19 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) + posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, notes_info=True, filter="raw", npf=True) posts = posts["posts"] + except ConnectionRefusedError: + retries += 1 + if post_id: + self.failed_posts.append(post_id) + self.dataset.update_status("ConnectionRefused: Unable to collect post %s/%s" % (blog, post_id)) + else: + self.dataset.update_status("ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date)) + time.sleep(10) + continue + except Exception as e: self.dataset.update_status("Reached the limit of the Tumblr API. 
Last timestamp: %s" % str(max_date)) self.api_limit_reached = True @@ -551,21 +676,23 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): posts = new_posts - # Append posts to main list - # Get the lowest date - max_date = sorted([post["timestamp"] for post in posts])[0] + if not post_id: - # Manually check if we have a lower date than the min date (`min_date`) already. - # This functonality is not natively supported by Tumblr. - if min_date: - if max_date < min_date: + # Append posts to main list + # Get the lowest date + max_date = sorted([post["timestamp"] for post in posts])[0] + + # Manually check if we have a lower date than the min date (`min_date`) already. + # This functonality is not natively supported by Tumblr. + if min_date: + if max_date < min_date: - # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date] + # Get rid of all the posts that are earlier than the max_date timestamp + posts = [post for post in posts if post["timestamp"] >= min_date] - if posts: - all_posts += posts - break + if posts: + all_posts += posts + break retries = 0 @@ -574,54 +701,14 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): if len(all_posts) >= self.max_posts: self.max_posts_reached = True break + if post_id: + break - self.dataset.update_status("Collected %s posts for blog %s" % str(len(all_posts), blog)) + self.dataset.update_status("Collected %s posts for blog %s" % (str(len(all_posts)), blog)) time.sleep(.2) return all_posts - def get_post_by_id(self, blog_name, post_id): - """ - Fetch individual posts - :param blog_name, str: The blog's name - :param id, int: The post ID - - returns result list, a list with a dictionary with the post's information - """ - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - - connection_retries = 0 - - while True: - if connection_retries >= 5: - self.dataset.update_status("Too many connection errors; unable to collect post %s" % post_id) - break - try: - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True, reblog_info=True, notes_info=True, filter="raw") - - except ConnectionRefusedError: - connection_retries += 1 - self.failed_posts.append(note["id"]) - self.dataset.update_status("ConnectionRefused: Unable to collect reblogs for post %s" % post_id) - time.sleep(10) - continue - - if post: - break - time.sleep(.2) - - # Tumblr API can sometimes return with this kind of error: - # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if not post or "posts" not in post: - return None - - # Get the first element of the list - it's always one post. - result = post["posts"][0] - - return result - def get_notes(self, blog_id, post_id): """ Gets data on the notes of a specific post. @@ -631,6 +718,7 @@ def get_notes(self, blog_id, post_id): :returns: a list with dictionaries of notes. """ + note_metrics = {} post_notes = [] max_date = None @@ -644,6 +732,8 @@ def get_notes(self, blog_id, post_id): max_notes_retries = 10 notes_retries = 0 + first_batch = True + count += 1 if self.interrupted: @@ -658,8 +748,12 @@ def get_notes(self, blog_id, post_id): # Requests a post's notes try: - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) - print(notes) + + # Imprtant: we're getting notes in 'conversation' mode to + # prioritise replies and reblogs that add text. 
+ # We're not interested in the the names of authors that liked the post + # or who reblogged without adding content. + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode="conversation") except ConnectionRefusedError: self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 @@ -675,6 +769,15 @@ def get_notes(self, blog_id, post_id): if "notes" in notes: notes_retries = 0 + # Add some metrics for the first response + if first_batch: + note_metrics = { + "reply_count": notes["total_replies"], + "reblog_count": notes["total_reblogs"], + "like_count": notes["total_likes"] + } + first_batch = False + for note in notes["notes"]: post_notes.append(note) @@ -693,6 +796,8 @@ def get_notes(self, blog_id, post_id): time.sleep(1) continue + post_notes = {"notes": post_notes} | note_metrics + return post_notes @staticmethod @@ -798,10 +903,14 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - body_asked = [] - author_asked = "" - authors_liked = [] + authors_reblogged = [] + reblog_trail = [] + body_reblogged = [] + author_trail = [] + body_ask = [] + author_ask = "" authors_replied = [] + like_count = "" replies = [] unknown_blocks = [] @@ -824,7 +933,7 @@ def map_item(post): for layout_block in post.get("layout", []): if layout_block["type"] == "ask": ask_blocks += layout_block["blocks"] - author_asked = layout_block["attribution"]["blog"]["name"] + author_ask = layout_block["attribution"]["blog"]["name"] # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. @@ -868,70 +977,16 @@ def map_item(post): # Text; we're adding Markdown formatting. elif block_type == "text": - text = block["text"] - - if block.get("formatting"): - - # Dict with index numbers as keys where inserts need to be made, - # and the replacement strings as values. Done this way so we know - # when multiple formatting operations need to be made at the same - # index position. - insert_indexes = set() - inserts = {} - - for fmt in block["formatting"]: - fmt_type = fmt["type"] - if fmt["type"] in ("link", "bold", "italic"): - s = fmt["start"] - e = fmt["end"] - - opening = True # To know if styles need to be appended or prepended - for n in [s, e]: - insert_indexes.add(n) - n = str(n) - if n not in inserts: - inserts[n] = "" - if fmt_type == "link" and opening: - inserts[n] = inserts[n] + "[" - elif fmt_type == "link" and not opening: - inserts[n] = "](" + fmt["url"] + ")" + inserts[n] - elif fmt_type == "italic": - inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" - elif fmt_type == "bold": - inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" - opening = False - if inserts: - extra_chars = 0 - for n, insert in inserts.items(): - n = int(n) + extra_chars - text = text[:n] + insert + text[n:] - extra_chars += len(insert) - - # Some more 'subtype' formatting - subtype = block.get("subtype") - if subtype: - if subtype == "unordered-list-item": - text = "- " + text - if subtype == "ordered-list-item": - text = str(ordered_list_count) + ". 
" + text - ordered_list_count += 1 - elif subtype == "heading1": - text = "#" + text - elif subtype == "heading2": - text = "##" + text - elif subtype == "quote": - text = ">" + text - elif subtype == "indented": - text = " " + text + md_text = SearchTumblr.format_tumblr_text(block) # If it's an ask text, we're storing it in # a different column if i in ask_blocks: block_type = "ask" - body_asked.append(block["text"]) + body_ask.append(block["text"]) else: raw_text.append(block["text"]) - formatted_text.append(text) + formatted_text.append(md_text) # Unknown block; can be a third-party app else: @@ -939,13 +994,16 @@ def map_item(post): blocks.append(block_type) - # Add note data + # Parse note data for note in post.get("notes", []): + if note["type"] == "like": - # Inserting at the start of the list to maintain chronological order. - authors_liked.insert(0, note["blog_name"]) - elif note["type"] in ("posted", "reblog"): - # If the original post is a text reblog, it will also show up in the notes. + if isinstance(like_count, str): + like_count = 0 + like_count += 1 + + if note["type"] in ("posted", "reblog"): + # If the post is a text reblog, it will also show up in the notes. # We can skip these since the data is already in the main post dict. if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: authors_reblogged.insert(0, note["blog_name"]) @@ -953,9 +1011,31 @@ def map_item(post): authors_replied.insert(0, note["blog_name"]) replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) + # The API sometimes gives back a 'trail' of reblogged content + # This includes reblogged content, but it's not entirely complete (e.g. no hashtags) + # so we'll only store the original blog name and its text content. + for i, reblog in enumerate(post.get("trail", [])): + + reblogged_text = [] + + if "broken_blog_name" in reblog: + reblog_author = reblog["broken_blog_name"] + else: + reblog_author = reblog["blog"]["name"] + + for reblog_block in reblog.get("content", []): + if reblog_block["type"] == "text": + reblogged_text.append(reblog_block["text"]) + + if not reblogged_text: + reblogged_text = "" + body_reblogged.append("\n".join(reblogged_text)) + + author_trail.append(reblog_author) + return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], - "id": post["id"], + "id": post["id"] if "id" in post else post["post"]["id"], "author": post["blog_name"], "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], @@ -967,18 +1047,21 @@ def map_item(post): "author_last_updated": post["blog"]["updated"], "post_url": post["post_url"], "post_slug": post["slug"], - "is_reblog": True if post.get("original_type") == "note" else "", + "is_reblog": True if post.get("parent_post_url") else "", + "reblog_key": post["reblog_key"], "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_asked": "\n".join(body_asked), - "author_asked": author_asked, + "body_reblogged": "\n\n".join(body_reblogged), + "author_trail": ",".join(author_trail), + "parent_post_url": post.get("parent_post_url", ""), + "authors_reblogged": ",".join(authors_reblogged), + "body_ask": "\n".join(body_ask), + "author_ask": author_ask, "content_order": ",".join(blocks), "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], - "like_count": len(authors_liked), - "authors_liked": ",".join(authors_liked), - #"reblog_count": len(authors_reblogged), - "reply_count": 
len(authors_replied), + "like_count": like_count, + "reply_count": len(authors_replied) if authors_replied else "", "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "link_urls": ",".join(link_urls), @@ -994,6 +1077,74 @@ def map_item(post): "unknown_blocks": "\n".join(unknown_blocks) }) + @staticmethod + def format_tumblr_text(text_content): + """ + Format text content according to Tumblr's Neue Post Format definition. + + :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) + :returns dict + + """ + + text = text_content["text"] + + if text_content.get("formatting"): + + # Dict with index numbers as keys where inserts need to be made, + # and the replacement strings as values. Done this way so we know + # when multiple formatting operations need to be made at the same + # index position. + insert_indexes = set() + inserts = {} + + for fmt in text_content["formatting"]: + fmt_type = fmt["type"] + if fmt["type"] in ("link", "bold", "italic"): + s = fmt["start"] + e = fmt["end"] + + opening = True # To know if styles need to be appended or prepended + for n in [s, e]: + insert_indexes.add(n) + n = str(n) + if n not in inserts: + inserts[n] = "" + if fmt_type == "link" and opening: + inserts[n] = inserts[n] + "[" + elif fmt_type == "link" and not opening: + inserts[n] = "](" + fmt["url"] + ")" + inserts[n] + elif fmt_type == "italic": + inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" + elif fmt_type == "bold": + inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" + opening = False + if inserts: + extra_chars = 0 + for n, insert in inserts.items(): + n = int(n) + extra_chars + text = text[:n] + insert + text[n:] + extra_chars += len(insert) + + # Some more 'subtype' formatting + subtype = text_content.get("subtype") + if subtype: + if subtype == "unordered-list-item": + text = "- " + text + if subtype == "ordered-list-item": + text = str(ordered_list_count) + ". " + text + ordered_list_count += 1 + elif subtype == "heading1": + text = "#" + text + elif subtype == "heading2": + text = "##" + text + elif subtype == "quote": + text = ">" + text + elif subtype == "indented": + text = " " + text + + return text + def after_process(self): """ Override of the same function in processor.py From 7d9df2d1f585d7b55ccca4bf5b5d5dd30b0b9126 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 26 Jul 2024 17:57:16 +0200 Subject: [PATCH 109/204] Improve Tumblr querying --- datasources/tumblr/search_tumblr.py | 272 ++++++++++++++++------------ 1 file changed, 157 insertions(+), 115 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index f2ad00dc5..d4997989b 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -107,62 +107,49 @@ def get_options(cls, parent_dataset=None, user=None): "query": { "type": UserInput.OPTION_TEXT_LARGE, "help": "Tags and/or blogs", - "tooltip": "E.g. #research tools, #digitalmethods, @the4catblog, @the4catblog:12347714095" + "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, "help": "Add note and reblog data (warning: slow)", - "tooltip": "Add post note data for every post. This includes note metrics (likes, replies, reblogs), " + "tooltip": "Add post and reblog note data for every post. This includes note metrics (likes, replies, reblogs), " "replies, and reblogged text. 
" - "Blog-level search includes reblogged text by default." - "This also allows adding reblogs as new posts", + "Blog-level search includes reblogged text by default. " + "Enables adding reblogs as new posts", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs of collected posts", - "tooltip": "Also include posts that reblogged the posts captured in the initial query. " + "tooltip": "Add posts that reblogged posts from the initial query to the dataset. " "Limited to 1,000 reblogs per post.", - "default": False, - "requires": "get_notes" - }, - "reblog_crawl_depth": { - "type": UserInput.OPTION_TEXT, - "help": "Reblog crawl depth", - "tooltip": "How many levels of reblogs to follow; e.g. a value of 2 adds every reblog " - "of the initial post, but also reblogs of these reblogs.", - "default": "1", - "requires": "get_reblogs", - "requires": "get_notes" + "requires": "get_notes==true", + "default": False }, - "follow_reblogs": { - "type": UserInput.OPTION_TOGGLE, - "help": "Add posts reblogged by collected posts", - "tooltip": "Also include posts that were reblogged by the posts captured in the initial query. " - "This adds the entire reblog 'trail' from the initial post to the collected post. " - "Only affects blog-level search; tag search only gets original posts.", - "default": False, - "requires": "get_notes" - }, "reblog_type": { "type": UserInput.OPTION_CHOICE, - "help": "Reblogs to include", + "help": "Reblogs to add", "options": { - "all": "All (also 'empty' reblogs)", - "hashtag_or_text": "Only with added hashtags and/or added text", - "hashtag": "Only with added hashtags", - "text": "Only with added text" + "text": "Only with added text", + "tag_or_text": "Only with added text and/or added hashtags" }, "tooltip": "What type of reblogs to add to the dataset.", - "default": "hashtag_or_text", - "requires": "get_notes" + "requires": "get_reblogs==true", + "default": "text" }, + "follow_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add posts reblogged by collected posts", + "tooltip": "Also include all posts that were reblogged by the posts captured in the initial query. " + "This adds the entire reblog 'trail' from the initial post to the collected post. " + "Only affects blog-level search; tag-level search only gets original posts.", + "default": False + }, "reblog_outside_daterange": { "type": UserInput.OPTION_TOGGLE, "help": "Allow reblogs and reblogged posts outside of date range", - "default": False, - "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the date range limits inserted below.", - "requires": "get_notes" + "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the optional date range limit inserted below.", + "default": True } } @@ -235,7 +222,7 @@ def get_items(self, query): get_reblogs = parameters.get("get_reblogs", False) reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) follow_reblogs = parameters.get("follow_reblogs", False) - reblog_type = parameters.get("reblog_type", "hashtag_or_text") + reblog_type = parameters.get("reblog_type", False) reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) # Store all info here @@ -243,7 +230,7 @@ def get_items(self, query): # Blog names and post IDs of extra posts we need to fetch # (e.g. 
in the reblog trail or posts that reblog captured posts) - extra_posts = set() + extra_posts = [] # Get date parameters min_date = parameters.get("min_date", None) @@ -281,7 +268,6 @@ def get_items(self, query): post_id = None if ":" in query: blog_name, post_id = blog_name.split(":") - new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) # Get tagged post @@ -305,13 +291,25 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break + # Check for reblogged posts in the reblog trail + if follow_reblogs: + for result in results: + # The post rail is stored in the trail list + for trail_post in result.get("trail", []): + # Some posts or blogs have been deleted; skip these + if not "broken_blog_name" in trail_post: + if trail_post["id"] not in self.seen_ids: + extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post"]["id"]}) + # Get note data. - # Also potentially store reblogs that we want to add to the dataset + # Blog-level searches already have some note data, like reblogged text, + # but not everything (like replies), so we're going to retrieve these here as well. + # Also store IDs of reblogs/reblogged posts that we want to add. if get_notes: - # Dictionary with the `reblog_key` as key and notes as value. - # Notes are the same for all posts in a reblog chain, - # so we can use this to check whether we already have the data. + # Create a dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain. + # This means that we may not have to re-query the same data. retrieved_notes = {} for i, post in enumerate(results): @@ -321,52 +319,74 @@ def get_items(self, query): if self.api_limit_reached: break - self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i+1, len(results))) # We may have already encountered this note-chain # with a different post. if post["reblog_key"] in retrieved_notes: notes = retrieved_notes[post["reblog_key"]] - # In the case of posts with just a few notes in blog-level search, + # In the case of posts with just a few notes, # we may have all the possible notes in the retrieved JSON. elif len(post["notes"]) == post["note_count"]: - notes = post["notes"] - - # Do some conversion that is also done in get_notes - for note in notes: - + # Add some metrics, like done in `get_notes`. + notes = { + "notes": post["notes"], + "reply_count": len([n for n in notes if n["type"] == "reply"]), + "reblog_count": len([n for n in notes if n["type"] == "reblog"]), + "like_count": len([n for n in notes if n["type"] == "like"]) + } else: - # We're getting notes in the "conversation" mode to - # prioritise replies and text reblogs. + # Get notes via the API # Only gets first 1,000 replies/text reblogs. - notes = self.get_notes(post["blog_name"], post["id"]) - time.sleep(.1) - - final_notes = {"notes": notes, - "like_count": notes["like_count"], - "reply_count": notes["reply_count"], - "reblog_count": notes["reblog_count"], - } - # Add to results - results[i] |= final_notes - retrieved_notes[post["reblog_key"]] = final_notes - - # Get the full data for text reblogs and add them as new posts + # We're using different querying modes since + # it'll speed up the process. The fastest is + # `conversation`, which prioritises text reblogs and + # replies, and also provides metrics on like and reblog counts; + # we'll use this as default. 
However, if the user + # has indicated they want to add reblogs with hashtags, + # we'll also have to use the `reblogs_with_tags` mode. + seen_notes = set() + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_notes=1000) + for note in notes["notes"]: + if note["type"] == "reblog": + seen_notes.add(note["post_id"]) + + # Get tag-only reblogs; these aren't returned in `conversation` mode. + if reblog_type == "tag_or_text": + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) + for tag_note in tag_notes: + if tag_note["post_id"] not in seen_notes: + notes["notes"].append(tag_note) + + # Add to posts + results[i] = {**results[i], **notes} + retrieved_notes[post["reblog_key"]] = notes + + # Get the full data for certain reblogs and add them as new posts if get_reblogs: - for note in final_notes["notes"]: + for note in notes["notes"]: - if reblog_type == "hashtag_or_text": + # Skip replies and likes + if note["type"] != "reblog": + continue - elif reblog_type == "hashtag_or_text": + elif reblog_type == "tag_or_text": + # Skip reblogs without tags or text + if not note.get("tags") and not note.get("added_text"): + continue - elif reblog_type == "text": + elif reblog_type == "text": + # Skip reblogs without added text + if not note.get("added_text"): + continue - elif reblog_type == "all": - pass + # Skip posts that we already collected + if note["post_id"] in self.seen_ids: + continue # Potentially skip new posts outside of the date range if reblog_outside_daterange and (max_date or min_date): @@ -376,23 +396,32 @@ def get_items(self, query): elif not min_date <= note["timestamp"] <= max_date: continue - extra_posts.add({"blog": note["blog_name"], "post_id": note["post_id"]}) - - # Check for reblogged posts in the reblog trail - if follow_reblogs: - for result in results: - if result["trail"] + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) # Add reblogged posts and reblogs to dataset - for extra_post in extra_posts: - print("add") + for i, extra_post in enumerate(extra_posts): + + self.dataset.update_status("Adding %s/%s reblogged posts to the dataset" % (i, len(extra_posts))) + + if extra_post["id"] not in self.seen_ids: + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) + + if new_post: + new_post = new_post[0] + + # Add note data; these should already be retrieved above + if get_notes: + new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} + + results.append(new_post) + self.seen_ids.add(extra_post["id"]) self.job.finish() return results def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ - Get Tumblr posts posts with a certain tag + Get Tumblr posts posts with a certain tag. :param tag, str: the tag you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. :param max_date: a unix timestamp, indicates posts should be max_date this date. 
@@ -483,7 +512,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): # self.api_limit_reached = True # break - # Make sure the Tumblr API doesn't magically stop at an earlier date + # Make sure the Tumblr API doesn't magically stop even if earlier posts are available if not posts: date_retries += 1 @@ -548,7 +577,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): time_difs.append(time_dif) - # To start a new query + # Stop if we found nothing for this query if not posts: break @@ -633,7 +662,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, notes_info=True, filter="raw", npf=True) + posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) posts = posts["posts"] except ConnectionRefusedError: @@ -709,11 +738,14 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): return all_posts - def get_notes(self, blog_id, post_id): + def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): """ Gets data on the notes of a specific post. - :param blog_id, str: The ID of the blog. - :param post_id, str: The ID of the post. + :param blog_id, str: The ID of the blog. + :param post_id, str: The ID of the post. + :param mode, str: The type of notes that get priority. + `conversation` prioritises text reblogs and replies. + :param mode, max_notes: Maximum amount of notes to return. :returns: a list with dictionaries of notes. """ @@ -733,6 +765,7 @@ def get_notes(self, blog_id, post_id): notes_retries = 0 first_batch = True + note_metrics = {} count += 1 @@ -746,14 +779,14 @@ def get_notes(self, blog_id, post_id): self.failed_posts.append(post_id) break - # Requests a post's notes + # Request a post's notes try: - # Imprtant: we're getting notes in 'conversation' mode to + # Important: we're getting notes in 'conversation' mode to # prioritise replies and reblogs that add text. # We're not interested in the the names of authors that liked the post # or who reblogged without adding content. - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode="conversation") + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode=mode) except ConnectionRefusedError: self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 @@ -770,19 +803,31 @@ def get_notes(self, blog_id, post_id): notes_retries = 0 # Add some metrics for the first response - if first_batch: + if first_batch and mode == "conversation": note_metrics = { - "reply_count": notes["total_replies"], + "note_count": notes["total_notes"], "reblog_count": notes["total_reblogs"], - "like_count": notes["total_likes"] + "like_count": notes["total_likes"], + "reply_count": 0 } first_batch = False + # Add notes for note in notes["notes"]: + if mode == "converstaion" and note["type"] == "reply": + note_metrics["reply_count"] += 1 + post_notes.append(note) + # `conversation` mode groups likes and reblogs without commentary + # in the `rollup_notes` list. We're adding reblogs to the post notes. 
+ if mode == "conversation": + if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: + for note in notes["rollup_notes"][0]["notes"]: + if note["type"] == "reblog": + post_notes.append(note) + if notes.get("_links"): - print("more notes for " + str(blog_id) + " " + str(post_id)) max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] time.sleep(.2) @@ -796,7 +841,8 @@ def get_notes(self, blog_id, post_id): time.sleep(1) continue - post_notes = {"notes": post_notes} | note_metrics + # Merge notes and note metrics + post_notes = {"notes": post_notes, **note_metrics} return post_notes @@ -827,7 +873,10 @@ def connect_to_tumblr(self): self.client = pytumblr.TumblrRestClient(*config_keys) - client_info = self.client.info() + try: + client_info = self.client.info() + except Exception as e: + raise ConnectionRefusedError("Couldn't connect to Tumblr API, (%s)" % e) # Check if there's any errors if client_info.get("meta"): @@ -906,7 +955,7 @@ def map_item(post): authors_reblogged = [] reblog_trail = [] body_reblogged = [] - author_trail = [] + reblog_trail = [] body_ask = [] author_ask = "" authors_replied = [] @@ -961,6 +1010,7 @@ def map_item(post): video_thumb_urls.append(block["poster"][0]["url"]) else: video_thumb_urls.append("") + # Embedded link elif block_type == "link": link_urls.append(block["url"]) @@ -994,22 +1044,11 @@ def map_item(post): blocks.append(block_type) - # Parse note data + # Parse some note for note in post.get("notes", []): - - if note["type"] == "like": - if isinstance(like_count, str): - like_count = 0 - like_count += 1 - - if note["type"] in ("posted", "reblog"): - # If the post is a text reblog, it will also show up in the notes. - # We can skip these since the data is already in the main post dict. - if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: - authors_reblogged.insert(0, note["blog_name"]) - elif note["type"] == "reply": + if note["type"] == "reply": authors_replied.insert(0, note["blog_name"]) - replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) + replies.insert(0, note["reply_text"]) # The API sometimes gives back a 'trail' of reblogged content # This includes reblogged content, but it's not entirely complete (e.g. 
no hashtags) @@ -1031,7 +1070,7 @@ def map_item(post): reblogged_text = "" body_reblogged.append("\n".join(reblogged_text)) - author_trail.append(reblog_author) + reblog_trail.append(reblog_author) return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], @@ -1052,16 +1091,16 @@ def map_item(post): "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n\n".join(body_reblogged), - "author_trail": ",".join(author_trail), + "reblog_trail": ",".join(reblog_trail), "parent_post_url": post.get("parent_post_url", ""), - "authors_reblogged": ",".join(authors_reblogged), "body_ask": "\n".join(body_ask), "author_ask": author_ask, "content_order": ",".join(blocks), "tags": ",".join(post.get("tags", "")), - "notes": post["note_count"], - "like_count": like_count, - "reply_count": len(authors_replied) if authors_replied else "", + "note_count": post["note_count"], + "reblog_count": post.get("reblog_count", ""), + "like_count": post.get("like_count", ""), + "reply_count": post.get("reply_count", ""), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "link_urls": ",".join(link_urls), @@ -1081,6 +1120,7 @@ def map_item(post): def format_tumblr_text(text_content): """ Format text content according to Tumblr's Neue Post Format definition. + Returns text as mardkown. :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) :returns dict @@ -1119,6 +1159,8 @@ def format_tumblr_text(text_content): elif fmt_type == "bold": inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" opening = False + + # Change text if inserts: extra_chars = 0 for n, insert in inserts.items(): From 6fe891c35ba1a7ee714b0bfb1f02a2a6b0c18659 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 30 Jul 2024 11:02:36 +0200 Subject: [PATCH 110/204] Change options for Tumblr --- datasources/tumblr/search_tumblr.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index d4997989b..c28465de0 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -120,8 +120,8 @@ def get_options(cls, parent_dataset=None, user=None): }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, - "help": "Add reblogs of collected posts", - "tooltip": "Add posts that reblogged posts from the initial query to the dataset. " + "help": "Add reblogs", + "tooltip": "Add posts that reblogged the initial results to the dataset. " "Limited to 1,000 reblogs per post.", "requires": "get_notes==true", "default": False @@ -131,7 +131,7 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Reblogs to add", "options": { "text": "Only with added text", - "tag_or_text": "Only with added text and/or added hashtags" + "text_or_tag": "Only with added text and/or added hashtags" }, "tooltip": "What type of reblogs to add to the dataset.", "requires": "get_reblogs==true", @@ -140,7 +140,7 @@ def get_options(cls, parent_dataset=None, user=None): "follow_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add posts reblogged by collected posts", - "tooltip": "Also include all posts that were reblogged by the posts captured in the initial query. " + "tooltip": "Add all posts that were reblogged by the initial results to the dataset. " "This adds the entire reblog 'trail' from the initial post to the collected post. 
" "Only affects blog-level search; tag-level search only gets original posts.", "default": False @@ -298,8 +298,8 @@ def get_items(self, query): for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these if not "broken_blog_name" in trail_post: - if trail_post["id"] not in self.seen_ids: - extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post"]["id"]}) + if trail_post["post_id"] not in self.seen_ids: + extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post_id"]}) # Get note data. # Blog-level searches already have some note data, like reblogged text, @@ -355,7 +355,7 @@ def get_items(self, query): seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. - if reblog_type == "tag_or_text": + if reblog_type == "text_or_tag": tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) for tag_note in tag_notes: if tag_note["post_id"] not in seen_notes: @@ -374,7 +374,7 @@ def get_items(self, query): if note["type"] != "reblog": continue - elif reblog_type == "tag_or_text": + elif reblog_type == "text_or_tag": # Skip reblogs without tags or text if not note.get("tags") and not note.get("added_text"): continue @@ -767,8 +767,6 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): first_batch = True note_metrics = {} - count += 1 - if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") @@ -814,9 +812,10 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # Add notes for note in notes["notes"]: - if mode == "converstaion" and note["type"] == "reply": + if mode == "conversation" and note["type"] == "reply": note_metrics["reply_count"] += 1 + count += 1 post_notes.append(note) # `conversation` mode groups likes and reblogs without commentary @@ -825,10 +824,15 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: for note in notes["rollup_notes"][0]["notes"]: if note["type"] == "reblog": + count += 1 post_notes.append(note) + if count >= max_notes: + break + if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + self.dataset.update_status("Added %s notes for post %s:%s" % (count, blog_id, post_id)) time.sleep(.2) # If there's no `_links` key, that's all. 
From 622f7791a62beee46fe900a3e91f208ac41f8ce7 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 30 Jul 2024 16:35:32 +0200 Subject: [PATCH 111/204] Revamp Tumblr search and allow reblogs in Explorer template --- datasources/tumblr/search_tumblr.py | 215 ++++++++++-------- webtool/lib/template_filters.py | 3 +- webtool/static/css/explorer/tumblr.css | 88 +++++-- .../explorer/datasource-templates/tumblr.html | 90 ++++++-- 4 files changed, 252 insertions(+), 144 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index c28465de0..dccd4d4da 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -1,7 +1,7 @@ """ Search Tumblr via its API -Can fetch posts from specific blogs or with specific hashtags +Can fetch posts from specific blogs or with specific tags For Tumblr API documentation, see https://www.tumblr.com/docs/en/api/v2 For Neue Post Format documentation, see https://github.com/tumblr/docs/blob/master/npf-spec.md @@ -35,7 +35,7 @@ class SearchTumblr(Search): type = "tumblr-search" # job ID category = "Search" # category title = "Search Tumblr" # title displayed in UI - description = "Retrieve Tumblr posts by hashtag or blog." # description displayed in UI + description = "Retrieve Tumblr posts by tags or blogs." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -47,6 +47,7 @@ class SearchTumblr(Search): max_retries = 3 # For API and connection retries. max_date_retries = 96 + 150 # For checking dates. 96 time retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). max_posts = 1000000 + max_reblogs = 1000 max_posts_reached = False api_limit_reached = False @@ -111,18 +112,19 @@ def get_options(cls, parent_dataset=None, user=None): }, "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Add note and reblog data (warning: slow)", - "tooltip": "Add post and reblog note data for every post. This includes note metrics (likes, replies, reblogs), " - "replies, and reblogged text. " - "Blog-level search includes reblogged text by default. " - "Enables adding reblogs as new posts", + "help": "Add note data (warning: slow)", + "tooltip": "Add note data for every post. This includes note metrics, " + "replies, reblogged text, and reblogged images. " + "Blog- and id-level search includes reblogged text by default. " + "Enables adding reblogs as new posts " + "Limited to the first 1,000 reblogs per post.", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs", - "tooltip": "Add posts that reblogged the initial results to the dataset. " - "Limited to 1,000 reblogs per post.", + "tooltip": "Add reblogs to the dataset. " + "", "requires": "get_notes==true", "default": False }, @@ -131,25 +133,17 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Reblogs to add", "options": { "text": "Only with added text", - "text_or_tag": "Only with added text and/or added hashtags" + "text_or_tag": "Only with added text and/or added tags (slow)" }, "tooltip": "What type of reblogs to add to the dataset.", "requires": "get_reblogs==true", "default": "text" }, - "follow_reblogs": { - "type": UserInput.OPTION_TOGGLE, - "help": "Add posts reblogged by collected posts", - "tooltip": "Add all posts that were reblogged by the initial results to the dataset. 
" - "This adds the entire reblog 'trail' from the initial post to the collected post. " - "Only affects blog-level search; tag-level search only gets original posts.", - "default": False - }, "reblog_outside_daterange": { "type": UserInput.OPTION_TOGGLE, - "help": "Allow reblogs and reblogged posts outside of date range", - "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the optional date range limit inserted below.", - "default": True + "help": "Retain reblogs outside of date range", + "requires": "get_reblogs==true", + "default": False } } @@ -220,8 +214,6 @@ def get_items(self, query): queries = re.split(",|\n", parameters.get("query", "")) get_notes = parameters.get("get_notes", False) get_reblogs = parameters.get("get_reblogs", False) - reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) - follow_reblogs = parameters.get("follow_reblogs", False) reblog_type = parameters.get("reblog_type", False) reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) @@ -235,13 +227,12 @@ def get_items(self, query): # Get date parameters min_date = parameters.get("min_date", None) max_date = parameters.get("max_date", None) + min_date = int(min_date) if min_date else 0 + max_date = int(max_date) if max_date else int(time.time()) - if min_date: - min_date = int(min_date) - if max_date: - max_date = int(max_date) - else: - max_date = int(time.time()) + if not queries: + self.dataset.finish_with_error("No queries given") + return # Connect to Tumblr API try: @@ -256,8 +247,9 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return - # for each tag or blog, get post - for query in queries: + # For each tag or blog, get posts + # with a limit of ten individual tasks. + for query in queries[:10]: query = query.strip() @@ -291,15 +283,17 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # Check for reblogged posts in the reblog trail - if follow_reblogs: + # Check for reblogged posts in the reblog trail; + # we're addingt these if we're adding reblogs. + if get_reblogs: for result in results: # The post rail is stored in the trail list for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these if not "broken_blog_name" in trail_post: - if trail_post["post_id"] not in self.seen_ids: - extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post_id"]}) + if trail_post["post"]["id"] not in self.seen_ids: + extra_posts.append({"blog": trail_post["blog"]["name"], + "id": trail_post["post"]["id"]}) # Get note data. # Blog-level searches already have some note data, like reblogged text, @@ -332,32 +326,35 @@ def get_items(self, query): # Add some metrics, like done in `get_notes`. notes = { "notes": post["notes"], - "reply_count": len([n for n in notes if n["type"] == "reply"]), - "reblog_count": len([n for n in notes if n["type"] == "reblog"]), - "like_count": len([n for n in notes if n["type"] == "like"]) + "reply_count": len([n for n in post["notes"] if n["type"] == "reply"]), + "reblog_count": len([n for n in post["notes"] if n["type"] == "reblog"]), + "like_count": len([n for n in post["notes"] if n["type"] == "like"]) } else: # Get notes via the API - # Only gets first 1,000 replies/text reblogs. + # Only gets first 1,000 replies or text/tag reblogs. # We're using different querying modes since # it'll speed up the process. 
The fastest is # `conversation`, which prioritises text reblogs and # replies, and also provides metrics on like and reblog counts; - # we'll use this as default. However, if the user - # has indicated they want to add reblogs with hashtags, - # we'll also have to use the `reblogs_with_tags` mode. + # we'll use this as default. If the user + # has indicated they also want to add reblogs with tags, + # we'll also use the `reblogs_with_tags` mode. seen_notes = set() - notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_notes=1000) + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) + reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog": - seen_notes.add(note["post_id"]) + if note["type"] == "reblog" or note["type"] == "reply": + if note["type"] == "reblog": # Replies don't have IDs + reblog_count += 1 + seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. - if reblog_type == "text_or_tag": - tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) - for tag_note in tag_notes: + if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_reblogs=self.max_reblogs - reblog_count) + for tag_note in tag_notes["notes"]: if tag_note["post_id"] not in seen_notes: notes["notes"].append(tag_note) @@ -374,34 +371,19 @@ def get_items(self, query): if note["type"] != "reblog": continue - elif reblog_type == "text_or_tag": - # Skip reblogs without tags or text - if not note.get("tags") and not note.get("added_text"): - continue - - elif reblog_type == "text": - # Skip reblogs without added text - if not note.get("added_text"): - continue - - # Skip posts that we already collected - if note["post_id"] in self.seen_ids: - continue - - # Potentially skip new posts outside of the date range - if reblog_outside_daterange and (max_date or min_date): - if not min_date: - if note["timestamp"] >= max_date: - continue - elif not min_date <= note["timestamp"] <= max_date: - continue - - extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) + if note["post_id"] not in self.seen_ids: + # Potentially skip extra posts outside of the date range + if not reblog_outside_daterange: + if note.get("timestamp"): + if not min_date >= note["timestamp"] >= max_date: + continue + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) + # Add reblogged posts and reblogs to dataset for i, extra_post in enumerate(extra_posts): - self.dataset.update_status("Adding %s/%s reblogged posts to the dataset" % (i, len(extra_posts))) + self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) @@ -409,7 +391,15 @@ def get_items(self, query): if new_post: new_post = new_post[0] - # Add note data; these should already be retrieved above + # Potentially skip new posts outside of the date range + # We (also) do this after the API call because a timestamp is + # not always present in the notes data. 
+ if not reblog_outside_daterange: + + if not min_date >= new_post["timestamp"] >= max_date: + continue + + # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -517,7 +507,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): date_retries += 1 - # We're first gonna check carefully if there's small timegaps by + # We're first gonna check carefully if there's small time gaps by # decreasing by six hours. # If that didn't result in any new posts, also dedicate 12 date_retries # with reductions of six months, just to be sure there's no data from @@ -530,7 +520,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. - if min_date: + if min_date != 0: if max_date <= min_date: break @@ -583,7 +573,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): # Manually check if we have a lower date than the lowest allowed date already (min date). # This functonality is not natively supported by Tumblr. - if min_date: + if min_date != 0: if max_date < min_date: # Get rid of all the posts that are earlier than the max_date timestamp @@ -692,7 +682,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): # time reductions. Make sure it's no error string, though. new_posts = [] for post in posts: - # Sometimes the API repsonds just with "meta", "response", or "errors". + # Sometimes the API reponds just with "meta", "response", or "errors". if isinstance(post, str): self.dataset.update_status("Couldn't add post:", post) retries += 1 @@ -705,7 +695,10 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): posts = new_posts - if not post_id: + if not new_posts: + break + + if new_posts and not post_id: # Append posts to main list # Get the lowest date @@ -713,7 +706,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): # Manually check if we have a lower date than the min date (`min_date`) already. # This functonality is not natively supported by Tumblr. - if min_date: + if min_date != 0: if max_date < min_date: # Get rid of all the posts that are earlier than the max_date timestamp @@ -738,19 +731,18 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): return all_posts - def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): + def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): """ Gets data on the notes of a specific post. :param blog_id, str: The ID of the blog. :param post_id, str: The ID of the post. :param mode, str: The type of notes that get priority. `conversation` prioritises text reblogs and replies. - :param mode, max_notes: Maximum amount of notes to return. + :param mode, max_reblogs: Maximum amount of notes to return. :returns: a list with dictionaries of notes. 
""" - - note_metrics = {} + post_notes = [] max_date = None @@ -761,18 +753,26 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # so we'll cap this at 100 # Stop trying to fetch the notes after this many retries - max_notes_retries = 10 + max_reblogs_retries = 10 notes_retries = 0 first_batch = True note_metrics = {} + stop_collecting = False + + # For status updates + if mode == "conversation": + note_type = "reblogs with text" + elif mode == "reblogs_with_tags": + note_type = "reblogs with tags" + if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") while True: - if notes_retries >= max_notes_retries: + if notes_retries >= max_reblogs_retries: self.dataset.update_status("Too many connection errors; unable to collect notes for post %s" % post_id) self.failed_posts.append(post_id) break @@ -801,6 +801,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): notes_retries = 0 # Add some metrics for the first response + # These metrics are only returned in conversation mode. if first_batch and mode == "conversation": note_metrics = { "note_count": notes["total_notes"], @@ -812,27 +813,33 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # Add notes for note in notes["notes"]: + + # Only count reblogs with added content (text or hashtags) + # towards the total count; replies are never too substantial, + # so we always collect them all. if mode == "conversation" and note["type"] == "reply": note_metrics["reply_count"] += 1 + elif mode == "conversation": + count += 1 + elif mode == "reblogs_with_tags": + # Skip notes without added tags + if not note.get("tags"): + continue + count += 1 - count += 1 post_notes.append(note) + + if count >= max_reblogs: + post_notes = post_notes[:count + note_metrics.get("reply_count", 0)] + stop_collecting = True - # `conversation` mode groups likes and reblogs without commentary - # in the `rollup_notes` list. We're adding reblogs to the post notes. - if mode == "conversation": - if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: - for note in notes["rollup_notes"][0]["notes"]: - if note["type"] == "reblog": - count += 1 - post_notes.append(note) - - if count >= max_notes: + if stop_collecting: break if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - self.dataset.update_status("Added %s notes for post %s:%s" % (count, blog_id, post_id)) + + self.dataset.update_status("Collected %s %s for @%s:%s" % (count, note_type, blog_id, post_id)) time.sleep(.2) # If there's no `_links` key, that's all. @@ -918,7 +925,7 @@ def validate_query(query, request, user): raise QueryParametersException("Search query cannot be empty.") # So it shows nicely in the frontend. - items = ", ".join([item.lstrip().rstrip() for item in items if item]) + items = ", ".join([item.strip() for item in items if item]) # the dates need to make sense as a range to search within query["min_date"], query["max_date"] = query.get("daterange") @@ -945,6 +952,7 @@ def map_item(post): media_types = ["photo", "video", "audio"] image_urls = [] + image_urls_reblogged = [] video_urls = [] video_thumb_urls = [] audio_urls = [] @@ -1055,8 +1063,8 @@ def map_item(post): replies.insert(0, note["reply_text"]) # The API sometimes gives back a 'trail' of reblogged content - # This includes reblogged content, but it's not entirely complete (e.g. 
no hashtags) - # so we'll only store the original blog name and its text content. + # This includes reblogged content, but it's not entirely complete (e.g. no tags) + # so we'll only store the original blog name and its text + image content. for i, reblog in enumerate(post.get("trail", [])): reblogged_text = [] @@ -1069,6 +1077,8 @@ def map_item(post): for reblog_block in reblog.get("content", []): if reblog_block["type"] == "text": reblogged_text.append(reblog_block["text"]) + if reblog_block["type"] == "image": + image_urls_reblogged.append(reblog_block["media"][0]["url"]) if not reblogged_text: reblogged_text = "" @@ -1082,7 +1092,8 @@ def map_item(post): "author": post["blog_name"], "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], - "timestamp": post["timestamp"], + "timestamp": datetime.fromtimestamp(post["timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), + "unix_timestamp": post["timestamp"], "author_subject": post["blog"]["title"], "author_description": strip_tags(post["blog"]["description"]), "author_url": post["blog"]["url"], @@ -1096,6 +1107,7 @@ def map_item(post): "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n\n".join(body_reblogged), "reblog_trail": ",".join(reblog_trail), + "parent_post_author": post.get("reblogged_from_name", ""), "parent_post_url": post.get("parent_post_url", ""), "body_ask": "\n".join(body_ask), "author_ask": author_ask, @@ -1111,6 +1123,7 @@ def map_item(post): "link_titles": "\n".join(link_titles), "link_descriptions": "\n".join(link_descriptions), "image_urls": ",".join(image_urls), + "image_urls_reblogged": ",".join(image_urls_reblogged), "video_urls": ",".join(video_urls), "video_thumb_urls": ",".join(video_thumb_urls), "audio_urls": ",".join(audio_urls), diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index f7bfc23a2..f92bdfbd7 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -202,7 +202,8 @@ def _jinja2_filter_social_mediafy(body, datasource=""): }, "tumblr": { "mention": "https://tumblr.com/", - "markdown": True # Hashtags aren't linked in the post body + "markdown": True + # Hashtags aren't linked in the post body }, "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index baeab6eac..9ef57cbcb 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -18,13 +18,54 @@ max-width: 540px; } +/* Author info */ +.author-row { + display: flex; + align-items: center; + padding: 19px; + font-size: 13px; + text-decoration: none; + color: #5e5e5e; + overflow: hidden; +} + .author { font-size: 13px; font-weight: bold; } +.author.pseudonymous { + width: 32px; + height: 32px; + border-radius: 3px; + color: white; + background-color: #2f4b66; + text-align: center; + vertical-align: middle; +} + +.author-row .author, .author-row a { + color: black; +} + +.author-row .author-avatar { + display: inline-block; +} + +.author-row .author { + display: inline-block; +} + +.author.pseudonymous i { + line-height: 32px; + color: white; +} + .author-avatar { width: 32px; +} + +.author-avatar:not(.reblog) { margin-right: 10px; } @@ -33,26 +74,27 @@ width: 100%; } -/* Main author info */ -header { - display: flex; - align-items: center; - padding: 19px; - text-decoration: none; - color: black; - overflow: hidden; +.author-row.reblog { + border-bottom: 1px solid 
rgba(0,0,0,0.13); } -header a { - color: black; +.reblog-notice { + padding-left: 3px; + padding-right: 3px; } -header .author-avatar { - display: inline-block; +.reblog-icon { + height: 32px; } -header .author { - display: inline-block; +.reblog-icon i { + background-color: #00cf35; + border-radius: 100%; + color: white; + font-size: 8px; + padding: 4px; + margin-top: 20px; + margin-left: -7px; } /* Media */ @@ -99,16 +141,16 @@ header .author { padding: 0px 19px 0px 19px; } -.post-content .body, .body-reblogged { - white-space: pre-wrap; +.post-content.reblog { + padding-bottom: 19px; line-height: 1.5em; + border-bottom: 1px solid rgba(0,0,0,0.13); } .post-content .body { - padding: 3px 0px 3px 0px; + padding: 10px 0px 10px 0px; } - .post-content h1 { font-size: 20px; font-weight: bold; @@ -183,9 +225,13 @@ a.embedded-link:hover { } .tags { - padding-top: 5px; + display: flex; + align-items: center; + padding: 19px 0px 19px 0px; + font-size: 15px; + text-decoration: none; list-style-type: none; - color: #5e5e5e; + color: #5e5e5e; word-break: break-all; } @@ -201,7 +247,7 @@ a.embedded-link:hover { /* Post footer */ footer { - margin: 19px; + margin: 0px 19px 19px 19px; padding-top: 19px; border-top: 1px solid rgba(0,0,0,0.13); } diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 3866f3c70..972766273 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -1,37 +1,85 @@ +{% set reblog = True if post.parent_post_author else False %}
    +
    {% if not pseudonymised %} - + {% if post["author_avatar_url"] %} -
    + {% endif %} - {{ post.get("author") }} + {% if reblog %} + + {% endif %} + {{ post.get("author") }} + {% if reblog %} reblogged {{ post.parent_post_author }}{% endif %} {% else %} - + + {% if post.parent_post_author %} reblogged{% endif %} {% endif %} +
    - -{% if post.author_trail %} +{% if reblog %} -{% for reblog_author in post.author_trail.split(",") %} -
    - {{ reblog_author }} -

    - {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} -

    -
    -{% endfor %} + {% for reblog_author in post.reblog_trail.split(",") %} +
    + {% if not pseudonymised %} +
    + + + +
    + {{ reblog_author }} + {% else %} + + {% endif %} +
    +
    + {% if post.get("image_urls_reblogged") %} + {% for image_url in post.image_urls_reblogged.split(",") %} +
    + +
    + {% endfor %} + {% endif %} +
    + {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} +
    +
    + {% endfor %} + + {% if post.body %} +
    + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
    + + + +
    + {% endif %} + + {{ post.get("author") }} + + {% else %} + + + {% endif %} +
    + {% endif %} {% endif %} @@ -44,7 +92,7 @@ {% for block in content_order %} {% if block == "text" %} -

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

    +
    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}
    {% set block_counts.text = block_counts.text + 1 %} {% elif block == "image" %} @@ -128,24 +176,24 @@
    -
    {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    +
    {{ post.unix_timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    - {% if post.notes %} + {% if post.note_count %}
    - {{ post.get("notes") | commafy }} note{% if post.get("notes", 0) > 1 %}s{% endif %} + {{ post.get("note_count") | commafy }} note{% if post.get("note_count", 0) > 1 %}s{% endif %} {% if post.get("reblog_count") %} - {{ post.reblog_count }} + {{ post.reblog_count | commafy }} {% endif %} {% if post.get("like_count") %} - + {% endif %} {% if post.get("reply_count") %} - {{ post.get("reply_count") }} + {{ post.get("reply_count") | commafy }} {% endif %}
    {% if post.get("authors_replied") %} From c8f204e1ef65b8bc995a03f32126f93df0b8c46a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 13:02:13 +0200 Subject: [PATCH 112/204] Improve and fix revamped Tumblr search --- datasources/tumblr/search_tumblr.py | 112 ++++++++++-------- webtool/static/css/explorer/tumblr.css | 1 + .../explorer/datasource-templates/tumblr.html | 3 + 3 files changed, 69 insertions(+), 47 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index dccd4d4da..efd932dc5 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -12,6 +12,7 @@ import pytumblr import requests import re +import json from requests.exceptions import ConnectionError from datetime import datetime from ural import urls_from_text @@ -96,18 +97,18 @@ def get_options(cls, parent_dataset=None, user=None): "type": UserInput.OPTION_INFO, "help": "Retrieve any kind of Tumblr posts with specific tags or from specific blogs. Gets 100.000 posts " "at max. You may insert up to ten tags or blogs.\n\n" - "Blog-level search also returns reblogs. *Tag-level search only returns original posts*. " - "Reblogs of tagged posts can be retrieved via the options below.\n\n" + "*Tag-level search only returns original posts*. " + "Reblogs of tagged posts can be retrieved via the options below. Blog-level search also returns reblogs.\n\n" "Tag search only get posts with the exact tag you insert. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`.\n\n" + "`gogh` will not get posts tagged with `van gogh`.\n\n" "A `#` before a tag is optional. Blog names must start with `@`.\n\n" - "Individual posts can be retrieved through the format `@blogname:post_id`.\n\n" + "Individual posts can be captured by inserting their URL or via the format `@blogname:post_id`.\n\n" "Keyword search is not allowed by the [Tumblr API](https://api.tumblr.com).\n\n" "If this 4CAT reached its Tumblr API rate limit, try again 24 hours later." }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags and/or blogs", + "help": "Tags, blogs, or post URLs. Seperate with comma or newline.", "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" }, "get_notes": { @@ -188,8 +189,9 @@ def get_options(cls, parent_dataset=None, user=None): } options["date-intro"] = { "type": UserInput.OPTION_INFO, - "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used " - "tags, it may return zero posts, even though older posts exist. To mitigate this, 4CAT decreases " + "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries.\n\n" + "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " "the date parameter (before) with six hours and sends the query again. This often " "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. 
" @@ -253,13 +255,42 @@ def get_items(self, query): query = query.strip() + post_id = None + + # Format @blogname:id if query.startswith("@"): - blog_name = query[1:] # Get a possible post ID - post_id = None + blog_name = query[1:] if ":" in query: blog_name, post_id = blog_name.split(":") + + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Post URL + elif "tumblr.com/" in query: + + try: + # Format https://{blogname}.tumblr.com/post/{post_id} + if "/post/" in query: + blog_name = query.split(".tumblr.com")[0].replace("https://", "").replace("www.", "").strip() + post_id = query.split("/")[-1].strip() + # May also be a slug string.. + if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + # Format https://tumblr.com/{blogname}/{post_id} + else: + blog_and_id = query.split("tumblr.com/")[-1] + blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL + blog_name, post_id = blog_and_id.split("/") + if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + except IndexError: + self.dataset.update_status("Invalid post URL: %s" % query) + continue + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) # Get tagged post @@ -284,7 +315,7 @@ def get_items(self, query): break # Check for reblogged posts in the reblog trail; - # we're addingt these if we're adding reblogs. + # we're storing their post IDs and blog names for later, if we're adding reblogs. if get_reblogs: for result in results: # The post rail is stored in the trail list @@ -322,7 +353,7 @@ def get_items(self, query): # In the case of posts with just a few notes, # we may have all the possible notes in the retrieved JSON. - elif len(post["notes"]) == post["note_count"]: + elif "notes" in post and (len(post["notes"]) == post["note_count"]): # Add some metrics, like done in `get_notes`. notes = { "notes": post["notes"], @@ -346,10 +377,9 @@ def get_items(self, query): notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog" or note["type"] == "reply": - if note["type"] == "reblog": # Replies don't have IDs - reblog_count += 1 - seen_notes.add(note["post_id"]) + if note["type"] == "reblog": # Replies don't have IDs + reblog_count += 1 + seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: @@ -362,7 +392,7 @@ def get_items(self, query): results[i] = {**results[i], **notes} retrieved_notes[post["reblog_key"]] = notes - # Get the full data for certain reblogs and add them as new posts + # Identify which notes/reblogs we can collect as new posts if get_reblogs: for note in notes["notes"]: @@ -378,6 +408,7 @@ def get_items(self, query): if note.get("timestamp"): if not min_date >= note["timestamp"] >= max_date: continue + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) # Add reblogged posts and reblogs to dataset @@ -386,19 +417,17 @@ def get_items(self, query): self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: - new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) + + # Potentially skip new posts outside of the date range + # not always present in the notes data. 
+ if not reblog_outside_daterange and (max_date and min_date): + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, min_date=min_date) + else: + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) if new_post: new_post = new_post[0] - - # Potentially skip new posts outside of the date range - # We (also) do this after the API call because a timestamp is - # not always present in the notes data. - if not reblog_outside_daterange: - if not min_date >= new_post["timestamp"] >= max_date: - continue - # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -693,33 +722,21 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): self.seen_ids.add(post["id"]) new_posts.append(post) - posts = new_posts - + # Possibly only keep posts within the date range. + if max_date and min_date: + new_posts = [p for p in new_posts if min_date <= p["timestamp"] <= max_date] + if not new_posts: break - if new_posts and not post_id: - - # Append posts to main list - # Get the lowest date - max_date = sorted([post["timestamp"] for post in posts])[0] - - # Manually check if we have a lower date than the min date (`min_date`) already. - # This functonality is not natively supported by Tumblr. - if min_date != 0: - if max_date < min_date: - - # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date] + # Append posts to main list + all_posts += new_posts - if posts: - all_posts += posts - break + # Get the lowest date for next loop + max_date = sorted([post["timestamp"] for post in posts])[0] retries = 0 - all_posts += posts - if len(all_posts) >= self.max_posts: self.max_posts_reached = True break @@ -798,6 +815,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): break if "notes" in notes: + notes_retries = 0 # Add some metrics for the first response @@ -805,8 +823,8 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): if first_batch and mode == "conversation": note_metrics = { "note_count": notes["total_notes"], - "reblog_count": notes["total_reblogs"], - "like_count": notes["total_likes"], + "reblog_count": notes.get("total_reblogs", 0), + "like_count": notes.get("total_likes", 0), "reply_count": 0 } first_batch = False diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index 9ef57cbcb..d792a915f 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -35,6 +35,7 @@ } .author.pseudonymous { + display: inline-block; width: 32px; height: 32px; border-radius: 3px; diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 972766273..65ad402ac 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -203,6 +203,9 @@
    {% if not pseudonymised %} + {% else %} + + {% endif %}
    From da62b83b08b208cfcbeeddf4bc7a2d719fec5374 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 14:08:54 +0200 Subject: [PATCH 113/204] Some more warnings in the Tumblr search info --- datasources/tumblr/search_tumblr.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index efd932dc5..0f696507b 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -108,8 +108,8 @@ def get_options(cls, parent_dataset=None, user=None): }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags, blogs, or post URLs. Seperate with comma or newline.", - "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" + "help": "Tags, blogs, or post URLs.", + "tooltip": " Seperate with comma or newline. Example:\n#research tools, @4catblog, https://tumblr.com/4catblog/12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, @@ -190,7 +190,8 @@ def get_options(cls, parent_dataset=None, user=None): options["date-intro"] = { "type": UserInput.OPTION_INFO, "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " - "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries.\n\n" + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " + "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " "the date parameter (before) with six hours and sends the query again. This often " "successfully returns older, un-fetched posts. 
If it didn't find new data after 96 retries (24 " From 0e739b327d8e1e6e1141830630e5e038d65f8843 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 17:34:11 +0200 Subject: [PATCH 114/204] Migrate script for expanded annotation table --- backend/database.sql | 36 ++++- helper-scripts/migrate/migrate-1.45-1.46.py | 155 ++++++++++++++++++++ 2 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 helper-scripts/migrate/migrate-1.45-1.46.py diff --git a/backend/database.sql b/backend/database.sql index 33f0ea393..f51df7e5d 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -67,11 +67,39 @@ CREATE TABLE datasets_owners ( CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops,key text_ops); - -- annotations -CREATE TABLE IF NOT EXISTS annotations ( - key text UNIQUE PRIMARY KEY, - annotations text DEFAULT '' +CREATE TABLE IF NOT EXISTS annotations_new ( + id SERIAL PRIMARY KEY, + post_id TEXT, + dataset TEXT, + timestamp INT DEFAULT 0, + timestamp_created INT DEFAULT 0, + label TEXT, + type TEXT, + options TEXT, + value TEXT, + author TEXT, + is_processor BOOLEAN DEFAULT FALSE, + metadata TEXT +); + +CREATE UNIQUE INDEX IF NOT EXISTS annotation_id + ON annotations_new ( + id +); +CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique + ON annotations_new ( + label, + dataset, + post_id +); +CREATE INDEX IF NOT EXISTS annotation_value + ON annotations_new ( + value +); +CREATE INDEX IF NOT EXISTS annotation_timestamp + ON annotations_new ( + timestamp ); -- metrics diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py new file mode 100644 index 000000000..2246d0ea2 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -0,0 +1,155 @@ +# Update the 'annotations' table so every annotation has its own row. 
+# also add extra data +import sys +import os +import json + +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../..")) +from common.lib.database import Database +from common.lib.logger import Logger + +log = Logger(output=True) + +import configparser + +ini = configparser.ConfigParser() +ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini")) +db_config = ini["DATABASE"] + +db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], + host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") + +print(" Creating new annotations table...") +db.execute(""" +CREATE TABLE IF NOT EXISTS annotations_new ( + id SERIAL PRIMARY KEY, + field_id SERIAL, + post_id TEXT, + dataset TEXT, + timestamp INT DEFAULT 0, + timestamp_created INT DEFAULT 0, + label TEXT, + type TEXT, + options TEXT, + value TEXT, + author TEXT, + is_processor BOOLEAN DEFAULT FALSE, + metadata TEXT +); +""") + +print(" Creating indexes for annotations table...") +db.execute(""" +CREATE UNIQUE INDEX IF NOT EXISTS annotation_id + ON annotations_new ( + id +); +CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique + ON annotations_new ( + label, + dataset, + post_id +); +CREATE INDEX IF NOT EXISTS annotation_value + ON annotations_new ( + value +); +CREATE INDEX IF NOT EXISTS annotation_timestamp + ON annotations_new ( + timestamp +); +""") + +print(" Transferring old annotations to new annotations table...") + +annotations = db.fetchall("SELECT * FROM annotations;") + +if not annotations: + print(" No annotation fields to transfer, skipping...") + +else: + print(" Transferring annotations") + + count = 0 + skipped_count = 0 + + columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,is_processor,metadata" + + # Each row are **all** annotations per dataset + for row in annotations: + + if not row.get("annotations"): + print(" No annotations for dataset %s, skipping..." % row["key"]) + skipped_count += 1 + continue + + dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["key"] + "';") + + # If the dataset is not present anymore, + # we're going to skip these annotations; + # likely the dataset is expired. + if not dataset: + print(" No dataset found for key %s, skipping..." % row["key"]) + skipped_count += 1 + continue + + annotation_fields = json.loads(dataset["annotation_fields"]) + author = dataset.get("creator", "") + + # Loop through all annotated posts + for post_id, post_annotations in json.loads(row["annotations"]).items(): + + # Loop through individual annotations per post + for label, value in post_annotations.items(): + + # Get the ID of this particular annotation field + field_id = [k for k, v in annotation_fields.items() if v["label"] == label] + + if field_id: + field_id = field_id[0] + + # Skip if this field was not saved to the datasets table + if not field_id or field_id not in annotation_fields: + print(" Annotation field ID not saved to datasets table, skipping...") + skipped_count += 1 + continue + + ann_type = annotation_fields[field_id]["type"] + options = annotation_fields[field_id]["options"] if "options" in annotation_fields[field_id] else "" + options = {k: v for d in options for k, v in d.items()} # flatten + + if isinstance(value, list): + value = ",".join(value) + + inserts = [( + str(post_id), # post_id; needs to be a string, changes per data source. 
+ int(field_id), # field_id; this is an ID for the same type of input field. + row["key"], # dataset + dataset["timestamp"], # timestamp + dataset["timestamp"], # timestamp_created + label, # label + ann_type, # type + json.dumps(options) if options else "", # options; each option has a key and a value. + value, # value + author, # author + False, # is_processor + json.dumps({}), # metadata + )] + + db.execute("INSERT INTO annotations_new (" + columns + ") VALUES %s", replacements=inserts) + + count += 1 + + if count % 10 == 0: + print(" Transferred %s annotations..." % count) + +print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) +print(" Deleting old annotations table...") +db.execute("DROP TABLE annotations") + +print(" Renaming new annotations table...") +db.execute("ALTER TABLE annotations_new RENAME TO annotations;") + +print(" - done!") \ No newline at end of file From 3c524c110aab89328539ed0c7a62cac5d547ba63 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 17:39:41 +0200 Subject: [PATCH 115/204] Get annotations per row --- common/lib/dataset.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 656f1d007..1d90111c7 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -850,15 +850,10 @@ def get_annotation_fields(self): def get_annotations(self): """ Retrieves the annotations for this dataset. - return dict: The annotations + return list: All annotations, each in their own dictionary. """ - annotations = self.db.fetchone("SELECT annotations FROM annotations WHERE key = %s;", (self.key,)) - - if annotations and annotations.get("annotations"): - return json.loads(annotations["annotations"]) - else: - return None + return self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) def save_annotation_fields(self, annotation_fields): """ From 9cfd5bd32f813d59c81684db318b906b83b469b9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 7 Aug 2024 11:40:02 +0200 Subject: [PATCH 116/204] First steps in revamping annotation saving --- VERSION | 2 +- backend/database.sql | 2 +- backend/workers/expire_items.py | 1 + common/lib/dataset.py | 540 ++++++++++++-------- common/lib/exceptions.py | 5 + helper-scripts/migrate/migrate-1.45-1.46.py | 69 ++- webtool/static/js/explorer.js | 130 +++-- webtool/views/views_explorer.py | 12 +- 8 files changed, 479 insertions(+), 282 deletions(-) diff --git a/VERSION b/VERSION index 7a39f43c7..fa2cb2583 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -1.44 +1.46 This file should not be modified. It is used by 4CAT to determine whether it needs to run migration scripts to e.g. 
update the database structure to a more diff --git a/backend/database.sql b/backend/database.sql index f51df7e5d..7e551e5d8 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -79,7 +79,7 @@ CREATE TABLE IF NOT EXISTS annotations_new ( options TEXT, value TEXT, author TEXT, - is_processor BOOLEAN DEFAULT FALSE, + by_processor BOOLEAN DEFAULT FALSE, metadata TEXT ); diff --git a/backend/workers/expire_items.py b/backend/workers/expire_items.py index ed4d1cc0f..ddf8afbdb 100644 --- a/backend/workers/expire_items.py +++ b/backend/workers/expire_items.py @@ -62,6 +62,7 @@ def expire_datasets(self): dataset = DataSet(key=dataset["key"], db=self.db) if dataset.is_expired(): self.log.info(f"Deleting dataset {dataset.key} (expired)") + dataset.delete_annotations(dataset_key=dataset.key) dataset.delete() except DataSetNotFoundException: diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 1d90111c7..1418494e1 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -19,7 +19,7 @@ from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, - MapItemException, MappedItemIncompleteException) + MapItemException, MappedItemIncompleteException, AnnotationException) class DataSet(FourcatModule): @@ -835,6 +835,8 @@ def get_columns(self): def get_annotation_fields(self): """ Retrieves the saved annotation fields for this dataset. + These are stored in the annotations table. + :return dict: The saved annotation fields. """ @@ -847,218 +849,12 @@ def get_annotation_fields(self): return annotation_fields - def get_annotations(self): - """ - Retrieves the annotations for this dataset. - return list: All annotations, each in their own dictionary. - """ - - return self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) - - def save_annotation_fields(self, annotation_fields): - """ - Save the annotation fields of a dataset to the datasets table. - If changes to the annotation fields affect older, existing annotations, - this function also updates or deletes those values. - - :param dict annotation_fields: Annotation fields, with a field ID as key - :return int: The number of annotation fields saved. - """ - - # Do some preparations - new_field_ids = set(annotation_fields.keys()) - text_fields = ["textarea", "text"] - option_fields = set() - - # Get existing annotation fields. - old_fields = self.get_annotation_fields() - - # We're saving the new annotation fields as-is - self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(annotation_fields), self.key)) - - # If new annotation fields change the annotations already saved (e.g. if a field is deleted), - # we must also check if we should update annotation data. - # This can get quite complex! - if old_fields: - annotations = self.get_annotations() - - if old_fields and annotations: - - fields_to_delete = set() - labels_to_update = {} - options_to_delete = set() - options_to_update = {} - - for field_id, field in old_fields.items(): - - # We'll delete all prior annotations for a field if its input field is deleted - if field_id not in new_field_ids: - - # Labels are used as keys in the annotations table - # They should already be unique, so that's okay. 
- fields_to_delete.add(field["label"]) - continue - - # If the type has changed, also delete prior references (except between text and textarea) - new_type = annotation_fields[field_id]["type"] - if field["type"] != new_type: - - if not field["type"] in text_fields and not new_type in text_fields: - fields_to_delete.add(field["label"]) - continue - - # If the label has changed, change it in the old annotations - old_label = old_fields[field_id]["label"] - new_label = annotation_fields[field_id]["label"] - - if old_label != new_label: - labels_to_update[old_label] = new_label - - # Check if the options for dropdowns or checkboxes have changed - if new_type == "checkbox" or new_type == "dropdown": - - if "options" in old_fields[field_id]: - - option_fields.add(old_fields[field_id]["label"]) - new_options = annotation_fields[field_id]["options"] - - new_ids = [list(v.keys())[0] for v in new_options] - new_ids = [list(v.keys())[0] for v in new_options] - - # If it's a dropdown or checkbox.. - for option in old_fields[field_id]["options"]: - option_id = list(option.keys())[0] - option_label = list(option.values())[0] - - # If this ID is not present anymore, delete it - if option_id not in new_ids: - options_to_delete.add(option_label) - continue - - # Change the label if it has changed. Bit ugly but it works. - new_label = [list(new_option.values())[0] for i, new_option in enumerate(new_options) if list(new_options[i].keys())[0] == option_id][0] - - if option_label != new_label: - options_to_update[option_label] = new_label - - # Loop through the old annotations if things need to be changed - if fields_to_delete or labels_to_update or options_to_update or options_to_delete: - - for post_id in list(annotations.keys()): - - for field_label in list(annotations[post_id].keys()): - - # Delete the field entirely - if field_label in fields_to_delete: - del annotations[post_id][field_label] - continue - - # Update the label - if field_label in labels_to_update: - annotations[post_id][labels_to_update[field_label]] = annotations[post_id].pop(field_label) - field_label = labels_to_update[field_label] - - # Update or delete option values - if field_label in option_fields: - options_inserted = annotations[post_id][field_label] - - # We can just delete/change the entire annotation if its a string - if type(options_inserted) == str: - - # Delete the option if it's not present anymore - if options_inserted in options_to_delete: - del annotations[post_id][field_label] - - # Update the option label if it has changed - elif options_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[options_inserted] - - # For lists (i.e. checkboxes), we have to loop - elif type(options_inserted) == list: - - for option_inserted in options_inserted: - - # Delete the option if it's not present anymore - if option_inserted in options_to_delete: - annotations[post_id][field_label].remove(option_inserted) - - # Update the option label if it has changed - elif option_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[option_inserted] - - # Delete entire post dict if there's nothing left - if not annotations[post_id]: - del annotations[post_id] - - # Save annotations as an empty string if there's none. - if not annotations: - annotations = "" - - # Save to the annotations table. - self.save_annotations(annotations) - - return len(annotation_fields) - - def save_annotations(self, annotations): - """ - Saves annotations for a dataset to the annotations table. 
- - :param dict annotations: Annotations dict, with post IDs as keys. - :return int: The number of posts with annotations. - - """ - - # If there were already annotations added, we need to make sure - # we're not incorrectly overwriting existing ones. - # We also need to check whether any of the input fields has changed. - # If so, we're gonna edit or remove their old values. - old_annotations = self.get_annotations() - delete_annotations = False - - if old_annotations and annotations: - # Loop through all new annotations and add/overwrite them - # with the old annotations dict. - for post_id in list(annotations.keys()): - old_annotations[post_id] = annotations[post_id] - - # Empty strings, lists, or None as input values get removed - fields_to_delete = [] - for label, values in old_annotations[post_id].items(): - if not values: - fields_to_delete.append(label) - for label in fields_to_delete: - del old_annotations[post_id][label] - delete_annotations = True - - # Empty lists/dicts get removed - if not old_annotations[post_id]: - del old_annotations[post_id] - delete_annotations = True - - annotations = old_annotations - - # If there's nothing to save or delete, do nothing - if not annotations and not delete_annotations: - return 0 - - # If the annotations are empty, remove the row from the annotations table - if len(annotations) == 0: - self.db.delete("annotations", {"key": self.key}) - return 0 - - # If there's something to add or change, - # we're saving all annotations as a JSON string - annotations = json.dumps(annotations) - self.db.upsert("annotations", {"key": self.key, "annotations": annotations}, constraints=["key"]) - - return len(annotations) - def update_label(self, label): """ Update label for this dataset - :param str label: New label - :return str: The new label, as returned by get_label + :param str label: New label + :return str: The new label, as returned by get_label """ self.parameters["label"] = label @@ -1798,6 +1594,332 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w # No other log available raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn") + # Annotation features + def get_annotations(self): + """ + Retrieves the annotations for this dataset. + return list: All annotations, each in their own dictionary. + """ + annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + if not annotations: + annotations = None + + return annotations + + def has_annotations(self): + """ + Returns True if there's one or more annotations found + """ + + annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + return True if annotation else False + + def has_annotation_fields(self): + """ + Returns True if there's annotation fields saved tot the dataset table + """ + + annotation_fields = self.get_annotation_fields() + + return True if annotation_fields else False + + def save_annotation_fields(self, new_fields, add=False): + """ + Save annotation field data to the datasets table (in the `annotation_fields` column). + If changes to the annotation fields affect existing annotations, + this function will also call `update_annotations_via_fields()` to change them. + + :param dict new_fields: Annotation fields, with a field ID as key. + + :param bool add: Wether we're merely adding new fields + or replacing the whole batch. If add is false, + `new_fields` should contain all fields. 
+ + :return int: The number of annotation fields saved. + + """ + + # Get existing annotation fields to see if stuff changed. + old_fields = self.get_annotation_fields() + changes = False + + # Do some validation + # Annotation field must be valid JSON. + try: + s = json.dumps(new_fields) + except ValueError: + raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) + + # Annotation fields must at minimum have `type` and `label` keys. + for field_id, annotation_field in new_fields.items(): + if not isinstance(field_id, str): + raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) + if not "label" in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) + if not "type" in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) + + # Keep track of whether existing fields have changed; if so, we're going to + # update the annotations table. + if field_id in old_fields: + if old_fields[field_id] != annotation_field: + changes = True + + # If we're just adding fields, add them to the old fields + # If the field already exists, overwrite the old field. + if add and old_fields: + all_fields = old_fields + for field_id, annotation_field in new_fields.items(): + all_fields[field_id] = annotation_field + new_fields = all_fields + + # We're saving the new annotation fields as-is. + # Ordering of fields is preserved this way. + self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) + + # If we're adding but the field already exists, update/delete annotations with that ID. + add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) + + if changes or add_and_overlap: + update_annotations_via_fields(old_fields, new_fields) + + return len(new_fields) + + def update_annotations_via_fields(self, old_fields, new_fields): + """ + Updates annotations in the annotations table if the input field + itself has been changed, for instance if a dropdown label is renamed. + + :param di old_fields: Old annotation fields + :param di new_fields: New annotation fields; this should contain not just + additions, but all fields, changed or otherwise. + + """ + + new_field_ids = set(annotation_fields.keys()) + text_fields = ["textarea", "text"] + + # If old and new fields are identical, do nothing. + if old_fields == new_fields: + return + + # Only update annotations if they, in fact, exist. + annotations = self.get_annotations() + if not annotations: + return + + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID + + # Loop through the old annotation fields + for old_field_id, old_field in old_fields.items(): + + # Delete all annotations of this type if the field is deleted. + if old_field_id not in new_fields: + fields_to_delete.add(old_field_id) + continue + + new_field = annotation_fields[old_field_id] + + # If the annotation type has changed, also delete existing annotations, + # except between text and textarea, where we can just change the type and keep the text. 
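# For instance (hypothetical field ID "f1"): dropdown -> checkbox deletes the
# old annotations, since the stored option values no longer apply, while
# text -> textarea keeps them:
#
#   old_fields = {"f1": {"label": "tone", "type": "dropdown", "options": {...}}}
#   new_fields = {"f1": {"label": "tone", "type": "checkbox", "options": {...}}}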
+ if old_field["type"] != new_field["type"]: + if not old_field["type"] in text_fields and not new_field["type"] in text_fields: + fields_to_delete.add(field_id) + continue + + # Loop through all the key/values in the new field data + # and update in case it's different from the old values. + update_data = {} + for field_key, field_value in new_field.items(): + + # Update if values don't match + if field_value != old_field.get(field_key): + + # Special case: option values that are removed/renamed. + # Here we only have to change specific values within the + # values column. + if field_key == "options": + + new_options = field_value + # Delete annotations of this type if all option fields are deleted + # (even though this should not be possible in the Explorer front-end) + if not new_options: + fields_to_delete.add(field_id) + continue + + old_options = old_field["options"] + + options_to_update = {} + + # Options are saved in a dict with IDs and labels as keys/values. + for old_option_id, old_option in old_options.items(): + # Renamed option label + if old_option_id in new_options and old_option != new_options[old_option_id]: + options_to_update[old_option] = new_options[option] # Old label -> new label + # Deleted option + elif old_option_id not in new_options: + options_to_update[old_option] = None # Remove None labels + + if options_to_update: + update_data[field_key] = {} + update_data[field_key]["options"] = options_to_update + + # For all other changes, just overwrite with new data. + else: + update_data[field_key] = field_value + + if update_data: + fields_to_update[new_field_id] = update_data + + # Delete annotations + if fields_to_delete: + self.delete_annotations(field_id=list(fields_to_delete)) + + # Change annotations based on changes in update fields + if fields_to_update: + new_annotations = [] + for annotation in annotations: + if annotation["field_id"] in fields_to_update: + for k, update_field in fields_to_update[annotation["field_id"]]: + + # Special case: Changed options + if k == "options": + new_values = [] + for inserted_option in annotations["value"].split(","): + if inserted_option in update_field: + if update_field[inserted_option] == None: + # Don't add + continue + elif inserted_option in update_field: + # Replace with new value + new_values.append(annotation["value"][old_option]) + else: + # Keep old value + new_values.append(inserted_option) + + update_field = new_values + + annotation[k] = update_field + + new_annotations.append(annotation) + + # Save updated annotations + self.save_annotations(new_annotations) + + def save_annotations(self, annotations, overwrite=True): + """ + Takes a list of annotations and saves them to the annotations table. + If a field is not yet present in the datasets table, it also adds it there. + + :param list annotations: List of dictionaries with annotation items. + :param bool overwrite: Whether to overwrite annotation if the label is already present + for the dataset. + + :returns int: How many annotations were saved. + + """ + + # Should be present for all annotation fields + mandatory_keys = ["post_id", "label", "value"] + + field_keys = {} + annotations_to_delete = set() + + # We're going to add the annotation metadata to the datasets table + # based on the annotations themselves. 
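# A minimal, hypothetical input for this method; everything else (dataset
# key, type, field_id, and timestamps) is filled in by the code below:
#
#   dataset.save_annotations([
#       {"post_id": "123456789", "label": "sentiment", "value": "positive"},
#   ])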
+ annotation_fields = self.get_annotation_fields() + existing_annotations = self.get_annotations() + existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] + + timestamp = time.time() + + new_annotations = [] + for annotation in annotations: + + # Do some validation; dataset key, post_id, label, and value need to be present. + missing_keys = [] + for mandatory_key in mandatory_keys: + if mandatory_key not in annotation: + missing_keys.append(mandatory_key) + if missing_keys: + raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) + + # Add dataset key + annotation["dataset"] = self.key + + # Raise exception if this label is already present for this dataset + # and we're not overwriting + if not overwrite and annotation["label"] in existing_labels: + raise AnnotationException("Couldn't save annotations; label %s already present") + + # If there's no type given, use 'text' + if not annotation.get("type"): + annotation["type"] = "text" + + # If there's no timestamp given, set it to the current time. + if not "timestamp" in annotation: + annotation["timestamp"] = timestamp + annotation["timestamp_created"] = timestamp + + # If not already given, create an ID for this annotation + # based on the label, type, and dataset key. + if "field_id" not in annotation: + field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) + field_id = int.from_bytes(field_id_base.encode(), "little") + annotation["field_id"] = field_id + + # Add annotation metadata if it is not saved to the datasets table yet. + # This is just a simple dict with a field ID, type, label, and possible options. + if annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation["type"] + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + new_annotations.append(annotation) + + # Save annotation fields if they're not present yet. + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + # If there's nothing to save or delete, do nothing + if not new_annotations: + return 0 + + # Overwrite old annotations with upsert. Else add. + self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + + return len(new_annotations) + + def delete_annotations(self, dataset_key=None, id=None, field_id=None): + """ + Deletes all annotations for an entire dataset or by a list of (field) IDs. + + :param str dataset_key: A dataset key. + :param li id: A list or string of unique annotation IDs. + :param li field_id: A list or string of IDs for annotation fields. + + :return int: The number of removed records. + """ + + if not dataset and not ids and not field_ids: + return 0 + + where = {} + if dataset_key: + where["dataset"] = dataset_key + if ids: + where["id"] = ids + if field_ids: + where["field_id"] = field_ids + + return self.db.delete("annotations", where) + def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/common/lib/exceptions.py b/common/lib/exceptions.py index 01bd9813f..f187b4258 100644 --- a/common/lib/exceptions.py +++ b/common/lib/exceptions.py @@ -44,6 +44,11 @@ class ProcessorException(FourcatException): """ pass +class AnnotationException(FourcatException): + """ + Raise for exceptions with setting/getting annotations. 
+ """ + pass class MapItemException(ProcessorException): """ diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 2246d0ea2..0a650773e 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -21,7 +21,37 @@ db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") -print(" Creating new annotations table...") + +datasets = db.fetchall("SELECT * FROM datasets WHERE annotation_fields != ''") + +print(" Converting annotation options from lists to dicts...") +for dataset in datasets: + + annotation_fields = dataset["annotation_fields"] + + # Flatten options from list of dicts to dict + options_converted = False + annotation_fields = json.loads(annotation_fields) + new_annotation_fields = annotation_fields + + for field_id, annotation_field in annotation_fields.items(): + + if "options" in annotation_field: + + flattened_options = {} + if isinstance(annotation_field["options"], list): + for op in annotation_field["options"]: + flattened_options.update(op) + new_annotation_fields[field_id]["options"] = flattened_options + options_converted = True + + if options_converted: + print(" Converting annotation options to list for dataset %s..." % dataset["key"]) + db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_annotation_fields), dataset["key"])) + +print(" Expanding the 'annotations' table.") + +print(" Creating new annotations table...") db.execute(""" CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, @@ -35,12 +65,12 @@ options TEXT, value TEXT, author TEXT, - is_processor BOOLEAN DEFAULT FALSE, + by_processor BOOLEAN DEFAULT FALSE, metadata TEXT ); """) -print(" Creating indexes for annotations table...") +print(" Creating indexes for annotations table...") db.execute(""" CREATE UNIQUE INDEX IF NOT EXISTS annotation_id ON annotations_new ( @@ -62,42 +92,42 @@ ); """) -print(" Transferring old annotations to new annotations table...") +print(" Transferring old annotations to new annotations table...") annotations = db.fetchall("SELECT * FROM annotations;") if not annotations: - print(" No annotation fields to transfer, skipping...") + print(" No annotation fields to transfer, skipping...") else: - print(" Transferring annotations") count = 0 skipped_count = 0 - columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,is_processor,metadata" + columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: - - if not row.get("annotations"): - print(" No annotations for dataset %s, skipping..." % row["key"]) - skipped_count += 1 - continue - dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["key"] + "';") + dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["dataset"] + "';") # If the dataset is not present anymore, # we're going to skip these annotations; # likely the dataset is expired. if not dataset: - print(" No dataset found for key %s, skipping..." % row["key"]) + print(" No dataset found for key %s, skipping..." 
% row["dataset"]) skipped_count += 1 continue annotation_fields = json.loads(dataset["annotation_fields"]) author = dataset.get("creator", "") + + if not row.get("annotations"): + print(" No annotations for dataset %s, skipping..." % row["dataset"]) + skipped_count += 1 + continue + # Loop through all annotated posts for post_id, post_annotations in json.loads(row["annotations"]).items(): @@ -112,7 +142,7 @@ # Skip if this field was not saved to the datasets table if not field_id or field_id not in annotation_fields: - print(" Annotation field ID not saved to datasets table, skipping...") + print(" Annotation field ID not saved to datasets table, skipping...") skipped_count += 1 continue @@ -126,7 +156,7 @@ inserts = [( str(post_id), # post_id; needs to be a string, changes per data source. int(field_id), # field_id; this is an ID for the same type of input field. - row["key"], # dataset + row["dataset"], # dataset dataset["timestamp"], # timestamp dataset["timestamp"], # timestamp_created label, # label @@ -134,7 +164,7 @@ json.dumps(options) if options else "", # options; each option has a key and a value. value, # value author, # author - False, # is_processor + False, # by_processor json.dumps({}), # metadata )] @@ -143,9 +173,10 @@ count += 1 if count % 10 == 0: - print(" Transferred %s annotations..." % count) + print(" Transferred %s annotations..." % count) -print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) + print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) + print(" Deleting old annotations table...") db.execute("DROP TABLE annotations") diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c6a9daab7..75f97a515 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -215,8 +215,10 @@ const annotations = { }, parseAnnotationFields: function (e) { - // Validates and converts the fields in the annotations editor. - // Returns an object with the set annotation fields. + /* + Validates and converts the fields in the annotations editor. + Returns an object with the set annotation fields. + */ var annotation_fields = {}; var warning = ""; @@ -237,7 +239,7 @@ const annotations = { let label = label_field.val().replace(/\s+/g, ' '); - // Get the random identifier of the field, so we + // Get the ID of the field, so we // can later check if it already exists. let field_id = parseInt(this.id.split("-")[1]); @@ -246,7 +248,7 @@ const annotations = { label_field.addClass("invalid"); warning = "Input names can't be empty"; } - // Make sure the names can't be duplicates. + // Make sure the names can't be duplicates else if (labels_added.includes(label)) { warning = "Field labels must be unique"; label_field.addClass("invalid"); @@ -280,7 +282,7 @@ const annotations = { if (!option_labels.includes(option_label) && option_label.length > 0) { - // We're using a unique key for these to match input fields. + // We're using a unique key for options as well. option = {} option[option_id] = option_label options.push(option); @@ -317,6 +319,74 @@ const annotations = { return annotation_fields; }, + parseAnnotation: function(e) { + /* + Converts the DOM objects of an annotation field + to an annotation Object. 
+ + Must be given an input field element + + */ + + annotation = {} + + let label = $(this).find(".annotation-label").text(); + let annotation_type = $(this).attr("class").split(" ").pop(); + let val = undefined; + let edited = false + let timestamp = Date.now() / 100 + + if (annotation_type == "text" || annotation_type == "textarea") { + val = $(this).find(".post-annotation-input").val(); + // It can be the case that the input text is deleted + // In this case we *do* want to push new data, so we check + // whether there's an 'edited' class present and save if so. + if ($(this).find(".post-annotation-input").hasClass("edited")) { + edited = true + } + } + else if (annotation_type == "dropdown") { + let selected = $(this).find(".post-annotation-options").val(); + val = selected; + } + else if (annotation_type == "checkbox") { + val = []; + $(this).find(".post-annotation-options > input").each(function(){ + if ($(this).is(":checked")) { + val.push($(this).val()); + } + if ($(this).hasClass("edited")) { + edited = true + } + }); + if (!val.length > 0) { + val = undefined; + } + } + if ((val != undefined && val != "") || edited) { + vals_changed = true; + val = ""; + } + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": "", + "post_id": post_id, + "dataset": "", + "timestamp": timestamp, + "timestamp_created": "", + "label": label, + "type": annotation_type, + "options": , + "value": "", + "author": "", + "by_processor": "", + "metadata": "" + } + + return annotation + }, + applyAnnotationFields: function (e){ // Applies the annotation fields to each post on this page. @@ -631,9 +701,9 @@ const annotations = { saveAnnotations: function (e){ // Write the annotations to the dataset and annotations table. - // First we're gonna collect the data for this page. - // Loop through each post's annotation field. - var anns = {}; + // First we're going to collect the data for this page. + // Loop through each post's annotation fields. + var anns = []; var dataset_key = $("#dataset-key").text(); $(".posts > li").each(function(){ @@ -644,50 +714,20 @@ const annotations = { if (post_annotations.length > 0) { - let post_vals = {}; post_annotations.find(".post-annotation").each(function(){ - let label = $(this).find(".annotation-label").text(); - let annotation_type = $(this).attr("class").split(" ").pop(); - let val = ""; - let edited = false - - if (annotation_type == "text" || annotation_type == "textarea") { - val = $(this).find(".post-annotation-input").val(); - // It can be the case that the input text is deleted - // In this case we *do* want to push new data, so we check - // whether there's an 'edited' class present and save if so. 
- if ($(this).find(".post-annotation-input").hasClass("edited")) { - edited = true - } - } - else if (annotation_type == "dropdown") { - let selected = $(this).find(".post-annotation-options").val(); - val = selected; - } - else if (annotation_type == "checkbox") { - val = []; - $(this).find(".post-annotation-options > input").each(function(){ - if ($(this).is(":checked")) { - val.push($(this).val()); - } - if ($(this).hasClass("edited")) { - edited = true - } - }); - if (!val.length > 0) { - val = undefined; - } - } - if ((val != undefined && val != "") || edited) { - vals_changed = true; - post_vals[label] = val; + // Extract annotation object from the element + let annotation = parseAnnotation(this); + + if (annotation) { + annotations.push(annotation); } }); if (vals_changed){ - anns[post_id] = post_vals; + annotation[post_id] = post_vals; } + } }) diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 18d657387..466735a0f 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -132,11 +132,9 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations inserted already. # If so, also pass these to the template. - annotations = db.fetchone("SELECT * FROM annotations WHERE key = %s", (key,)) - if not annotations or not annotations.get("annotations"): - annotations = None - else: - annotations = json.loads(annotations["annotations"]) + annotations = db.fetchall("SELECT * FROM annotations WHERE key = %s", (key,)) + if annotations: + annotations = json.loads(annotations) # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) @@ -221,7 +219,7 @@ def explorer_api_posts(datasource, post_ids): @openapi.endpoint("explorer") def explorer_save_annotation_fields(key): """ - Save teh annotation fields of a dataset to the datasets table. + Save the annotation fields of a dataset to the datasets table. :param str key: The dataset key. @@ -255,7 +253,7 @@ def explorer_save_annotations(key): :param str key: The dataset key. - :return-error 404: If the dataset ID does not exist. + :return-error 404: If the dataset key does not exist. :return int: The number of posts with annotations saved. 
""" From e95c4bd33bffc09c15ca0a5f684c2b3415cb0815 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 7 Aug 2024 12:47:50 +0200 Subject: [PATCH 117/204] remove unused variables in explorer.js --- webtool/static/js/explorer.js | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c6a9daab7..ae317e82d 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -74,7 +74,7 @@ const annotations = { // Delete an entire annotation input // We're in a grid of threes, so this involves three divs - editor_controls.on("click", ".annotation-field > .delete-input", function(e){ + editor_controls.on("click", ".annotation-field > .delete-input", function(){ let parent_div = $(this).parent().parent(); parent_div.next().remove(); // Input type parent_div.next().remove(); // Options @@ -105,11 +105,12 @@ const annotations = { }); // Make saving available when annotations are changed - $(".post-annotations").on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); - $(".post-annotations").on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); + let post_annotations = $(".post-annotations"); + post_annotations.on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); + post_annotations.on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); // Keep track of whether the annotations are edited or not. - $(".post-annotations").on("keydown change", ".post-annotation-input, .post-annotation input, .post-annotation textarea", function(){$(this).addClass("edited")}); + post_annotations.on("keydown change", ".post-annotation-input, .post-annotation input, .post-annotation textarea", function(){$(this).addClass("edited")}); // Save the annotations to the database $("#save-annotations").on("click", function(){ @@ -125,8 +126,6 @@ const annotations = { } }) - var old_annotation_fields = $("#annotation-fields").html(); - // Check whether there's already fields saved for this dataset annotations.fieldsExist(); @@ -143,7 +142,6 @@ const annotations = { // Change the type of input fields when switching in the dropdown let type = $(el).val(); - let old_type = $(el).attr("data-val"); let options = $(el).parent().parent().next(); let option_fields = options.find(".option-field"); From 2fdb87640140ef8ec591185e2d15f8ce1599b9aa Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 7 Aug 2024 17:52:24 +0200 Subject: [PATCH 118/204] First steps in giving annotations their own class --- backend/database.sql | 4 +- common/lib/annotation.py | 265 ++++++++++++++++++++ common/lib/dataset.py | 164 ++++-------- helper-scripts/migrate/migrate-1.45-1.46.py | 1 - processors/metrics/count_posts.py | 7 + webtool/static/js/explorer.js | 14 +- webtool/views/views_explorer.py | 23 +- 7 files changed, 343 insertions(+), 135 deletions(-) create mode 100644 common/lib/annotation.py diff --git a/backend/database.sql b/backend/database.sql index 7e551e5d8..eafc9aaf8 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -68,9 +68,9 @@ CREATE TABLE datasets_owners ( CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops,key text_ops); -- annotations -CREATE TABLE IF NOT EXISTS annotations_new ( +CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, - post_id TEXT, + item_id TEXT, dataset 
TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, diff --git a/common/lib/annotation.py b/common/lib/annotation.py new file mode 100644 index 000000000..771e28343 --- /dev/null +++ b/common/lib/annotation.py @@ -0,0 +1,265 @@ +""" +Annotation class +""" + +from common.config_manager import config + +class Annotation: + """ + Annotation class + + Annotations are always tied to a dataset and an item ID. + + """ + + # Attributes must be created here to ensure getattr and setattr work properly + + data = None + db = None + + id = "" # Unique ID for this annotation + parent_id = "" # ID of the data for this annotation, e.g. post ID + dataset = "" # Dataset key this annotation is generated from + timestamp = 0 # When this annotation was edited + timestamp_created = 0 # When this timestamp was created + label = "" # Label of annotation + options = [] # Possible options + value = "" # The actual annotation value + author = "" # Who made the annotation + by_processor = False # Whether the annotation was made by a processor + metadata = {} # Misc metadata + + def __init__(self, db, data, id=None, item_id=None, label=None, dataset_key=None): + """ + Instantiate annotation object. + + :param db: Database connection object + :param dict data: Annotation data; should correspond to the annotations table records. + + """ + + self.db = db + self.data = data + self.item_id = item_id + + if id is not None: + self.id = id + current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.id,)) + if not current: + raise AnnotationException( + "Annotation() requires a valid ID for its 'id' argument, \"%s\" given" % id) + + # Should be present for all annotation fields + mandatory_keys = ["post_id", "label", "value"] + + + if dataset_key is not None and label is not None and dataset_key is not None: + current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.key,)) + if not current: + raise DataSetNotFoundException( + "DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) + + + def get_by_id(db, id): + """ + Get annotation by ID + + :param db: Database connection object + :param str name: ID of annotation + :return: Annotation object, or `None` for invalid annotation ID + """ + data = db.fetchone("SELECT * FROM annotations WHERE id = %s", (id,)) + if not annotation: + return None + else: + return Annotation.get_by_data(db, data) + + def get_by_data(db, data): + """ + Instantiate annotation object with given data + + :param db: Database handler + :param dict data: Annotation data, should correspond to a database row + :return Annotation: Annotation object + """ + return Annotation(db, data) + + def set_id_by_data(self, item): + """ + Creates an ID based on the data of the item it has annotated. + + + """ + + + return True + + def save(self): + """ + Save an annotation to the database. + """ + return True + + @staticmethod + def save_many(self, annotations, overwrite=True): + """ + Takes a list of annotations and saves them to the annotations table. + If a field is not yet present in the datasets table, it also adds it there. + + :param bool overwrite: Whether to overwrite annotation if the label is already present + for the dataset. + + :returns int: How many annotations were saved. + + """ + + field_keys = {} + annotations_to_delete = set() + + # We're going to add the annotation metadata to the datasets table + # based on the annotations themselves. 
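# Under the new schema each annotation becomes one row in the `annotations`
# table instead of an entry in a per-dataset JSON blob. A stored row looks
# roughly like this (hypothetical values):
#
#   {"item_id": "123", "dataset": "abcd1234", "label": "sentiment",
#    "type": "text", "value": "positive", "author": "some_user",
#    "by_processor": False, "metadata": "{}"}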
+ annotation_fields = self.get_annotation_fields() + existing_annotations = self.get_annotations() + existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] + + timestamp = time.time() + + new_annotations = [] + for annotation in annotations: + + # Do some validation; dataset key, post_id, label, and value need to be present. + missing_keys = [] + for mandatory_key in mandatory_keys: + if mandatory_key not in annotation: + missing_keys.append(mandatory_key) + if missing_keys: + raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) + + # Add dataset key + annotation["dataset"] = self.key + + # Raise exception if this label is already present for this dataset + # and we're not overwriting + if not overwrite and annotation["label"] in existing_labels: + raise AnnotationException("Couldn't save annotations; label %s already present") + + # If there's no type given, use 'text' + if not annotation.get("type"): + annotation["type"] = "text" + + # If there's no timestamp given, set it to the current time. + if not "timestamp" in annotation: + annotation["timestamp"] = timestamp + annotation["timestamp_created"] = timestamp + + # If not already given, create an ID for this annotation + # based on the label, type, and dataset key. + if "field_id" not in annotation: + field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) + field_id = int.from_bytes(field_id_base.encode(), "little") + annotation["field_id"] = field_id + + # Add annotation metadata if it is not saved to the datasets table yet. + # This is just a simple dict with a field ID, type, label, and possible options. + if annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation["type"] + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + new_annotations.append(annotation) + + # Save annotation fields if they're not present yet. + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + # If there's nothing to save or delete, do nothing + if not new_annotations: + return 0 + + # Overwrite old annotations with upsert. Else add. + self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + + return len(new_annotations) + + def delete(self): + """ + Deletes this annotation + """ + return self.db.delete("annotations", {"id": self.id}) + + @staticmethod + def delete_many(self, dataset_key=None, id=None, field_id=None): + """ + Deletes annotations for an entire dataset or by a list of (field) IDs. + + :param str dataset_key: A dataset key. + :param li id: A list or string of unique annotation IDs. + :param li field_id: A list or string of IDs for annotation fields. + + :return int: The number of removed records. 
+ """ + if not dataset_key and not id and not field_id: + return 0 + + where = {} + if dataset_key: + where["dataset"] = dataset_key + if id: + where["id"] = id + if field_id: + where["field_id"] = field_id + + return self.db.delete("annotations", where) + + + def __getattr__(self, attr): + """ + Getter so we don't have to use .data all the time + + :param attr: Data key to get + :return: Value + """ + + if attr in dir(self): + # an explicitly defined attribute should always be called in favour + # of this passthrough + attribute = getattr(self, attr) + return attribute + elif attr in self.data: + return self.data[attr] + else: + raise AttributeError("Annotation instance has no attribute %s" % attr) + + def __setattr__(self, attr, value): + """ + Setter so we can flexibly update the database + + Also updates internal data stores (.data etc). If the attribute is + unknown, it is stored within the 'metadata' attribute. + + :param str attr: Attribute to update + :param value: New value + """ + + # don't override behaviour for *actual* class attributes + if attr in dir(self): + super().__setattr__(attr, value) + return + + if attr not in self.data: + self.parameters[attr] = value + attr = "metadata" + value = self.parameters + + if attr == "metadata": + value = json.dumps(value) + + self.db.update("annotations", where={"id": self.id}, data={attr: value}) + + self.data[attr] = value + + if attr == "metadata": + self.parameters = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 1418494e1..92dcfc625 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -10,13 +10,12 @@ import csv import re -from pathlib import Path - import backend from common.config_manager import config +from common.lib.annotation import Annotation from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, convert_to_float, flatten_dict -from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.item_mapping import MappedItem, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, MapItemException, MappedItemIncompleteException, AnnotationException) @@ -83,20 +82,17 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare if not current: raise DataSetNotFoundException("DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) - query = current["query"] elif job is not None: current = self.db.fetchone("SELECT * FROM datasets WHERE parameters::json->>'job' = %s", (job,)) if not current: raise DataSetNotFoundException("DataSet() requires a valid job ID for its 'job' argument") - query = current["query"] self.key = current["key"] elif data is not None: current = data if "query" not in data or "key" not in data or "parameters" not in data or "key_parent" not in data: raise DataSetException("DataSet() requires a complete dataset record for its 'data' argument") - query = current["query"] self.key = current["key"] else: if parameters is None: @@ -217,7 +213,7 @@ def clear_log(self): extension. 
""" log_path = self.get_log_path() - with log_path.open("w") as outfile: + with log_path.open("w"): pass def log(self, log): @@ -347,7 +343,7 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau if annotation_fields: annotations = self.get_annotations() - # missing field strategy can be for all fields at once, or per field + # missing field strategy can be for all fields at once, or per field # if it is per field, it is a dictionary with field names and their strategy # if it is for all fields, it is may be a callback, 'abort', or 'default' default_strategy = "default" @@ -832,23 +828,6 @@ def get_columns(self): # Filetype not CSV or an NDJSON with `map_item` return [] - def get_annotation_fields(self): - """ - Retrieves the saved annotation fields for this dataset. - These are stored in the annotations table. - - :return dict: The saved annotation fields. - """ - - annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.key,)) - - if annotation_fields and annotation_fields.get("annotation_fields"): - annotation_fields = json.loads(annotation_fields["annotation_fields"]) - else: - annotation_fields = {} - - return annotation_fields - def update_label(self, label): """ Update label for this dataset @@ -908,7 +887,6 @@ def change_datasource(self, datasource): """ Change the datasource type for this dataset - :param str label: New datasource type :return str: The new datasource type """ @@ -1604,6 +1582,8 @@ def get_annotations(self): if not annotations: annotations = None + else: + annotations = [Annotation(data=annotation, dataset=self) for annotation in annotations] return annotations @@ -1625,15 +1605,32 @@ def has_annotation_fields(self): return True if annotation_fields else False + def get_annotation_fields(self): + """ + Retrieves the saved annotation fields for this dataset. + These are stored in the annotations table. + + :return dict: The saved annotation fields. + """ + + annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.key,)) + + if annotation_fields and annotation_fields.get("annotation_fields"): + annotation_fields = json.loads(annotation_fields["annotation_fields"]) + else: + annotation_fields = None + + return annotation_fields + def save_annotation_fields(self, new_fields, add=False): """ Save annotation field data to the datasets table (in the `annotation_fields` column). If changes to the annotation fields affect existing annotations, this function will also call `update_annotations_via_fields()` to change them. - :param dict new_fields: Annotation fields, with a field ID as key. + :param dict new_fields: New annotation fields, with a field ID as key. - :param bool add: Wether we're merely adding new fields + :param bool add: Whether we're merely adding new fields or replacing the whole batch. If add is false, `new_fields` should contain all fields. @@ -1648,7 +1645,7 @@ def save_annotation_fields(self, new_fields, add=False): # Do some validation # Annotation field must be valid JSON. 
try: - s = json.dumps(new_fields) + json.dumps(new_fields) except ValueError: raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) @@ -1683,7 +1680,7 @@ def save_annotation_fields(self, new_fields, add=False): add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) if changes or add_and_overlap: - update_annotations_via_fields(old_fields, new_fields) + self.update_annotations_via_fields(old_fields, new_fields) return len(new_fields) @@ -1698,7 +1695,6 @@ def update_annotations_via_fields(self, old_fields, new_fields): """ - new_field_ids = set(annotation_fields.keys()) text_fields = ["textarea", "text"] # If old and new fields are identical, do nothing. @@ -1721,7 +1717,8 @@ def update_annotations_via_fields(self, old_fields, new_fields): fields_to_delete.add(old_field_id) continue - new_field = annotation_fields[old_field_id] + field_id = old_field_id + new_field = new_fields[field_id] # If the annotation type has changed, also delete existing annotations, # except between text and textarea, where we can just change the type and keep the text. @@ -1758,7 +1755,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): for old_option_id, old_option in old_options.items(): # Renamed option label if old_option_id in new_options and old_option != new_options[old_option_id]: - options_to_update[old_option] = new_options[option] # Old label -> new label + options_to_update[old_option] = new_options[old_option_id] # Old label -> new label # Deleted option elif old_option_id not in new_options: options_to_update[old_option] = None # Remove None labels @@ -1772,7 +1769,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): update_data[field_key] = field_value if update_data: - fields_to_update[new_field_id] = update_data + fields_to_update[field_id] = update_data # Delete annotations if fields_to_delete: @@ -1782,7 +1779,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): if fields_to_update: new_annotations = [] for annotation in annotations: - if annotation["field_id"] in fields_to_update: + if annotation.field_id in fields_to_update: for k, update_field in fields_to_update[annotation["field_id"]]: # Special case: Changed options @@ -1795,7 +1792,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): continue elif inserted_option in update_field: # Replace with new value - new_values.append(annotation["value"][old_option]) + new_values.append(update_field[inserted_option]) else: # Keep old value new_values.append(inserted_option) @@ -1810,7 +1807,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): self.save_annotations(new_annotations) def save_annotations(self, annotations, overwrite=True): - """ + """ Takes a list of annotations and saves them to the annotations table. If a field is not yet present in the datasets table, it also adds it there. @@ -1822,79 +1819,20 @@ def save_annotations(self, annotations, overwrite=True): """ - # Should be present for all annotation fields - mandatory_keys = ["post_id", "label", "value"] - - field_keys = {} - annotations_to_delete = set() - - # We're going to add the annotation metadata to the datasets table - # based on the annotations themselves. 
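# (The block removed below merged new values into the per-dataset JSON blob;
# in the rewritten flow this bookkeeping moves to Annotation.save_many() and
# the per-row `annotations` table.)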
- annotation_fields = self.get_annotation_fields() - existing_annotations = self.get_annotations() - existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] - - timestamp = time.time() - - new_annotations = [] - for annotation in annotations: - - # Do some validation; dataset key, post_id, label, and value need to be present. - missing_keys = [] - for mandatory_key in mandatory_keys: - if mandatory_key not in annotation: - missing_keys.append(mandatory_key) - if missing_keys: - raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) - - # Add dataset key - annotation["dataset"] = self.key - - # Raise exception if this label is already present for this dataset - # and we're not overwriting - if not overwrite and annotation["label"] in existing_labels: - raise AnnotationException("Couldn't save annotations; label %s already present") - - # If there's no type given, use 'text' - if not annotation.get("type"): - annotation["type"] = "text" - - # If there's no timestamp given, set it to the current time. - if not "timestamp" in annotation: - annotation["timestamp"] = timestamp - annotation["timestamp_created"] = timestamp - - # If not already given, create an ID for this annotation - # based on the label, type, and dataset key. - if "field_id" not in annotation: - field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) - field_id = int.from_bytes(field_id_base.encode(), "little") - annotation["field_id"] = field_id - - # Add annotation metadata if it is not saved to the datasets table yet. - # This is just a simple dict with a field ID, type, label, and possible options. - if annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation["type"] - } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] - - new_annotations.append(annotation) - - # Save annotation fields if they're not present yet. - if annotation_fields != self.get_annotation_fields(): - self.save_annotation_fields(annotation_fields) - - # If there's nothing to save or delete, do nothing - if not new_annotations: + if not annotations: return 0 - # Overwrite old annotations with upsert. Else add. - self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + # Add dataset info to annotations + key = self.key + owner = self.get_owners()[0] + if "dataset" + for i in range(len(annotations)): + if not annotations[i].get("dataset"): + annotations[i]["dataset"] = key + if not annotations[i].get("author"): + annotations[i]["author"] = owner - return len(new_annotations) + return Annotation.save_many(annotations, overwrite=overwrite) def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ @@ -1907,16 +1845,16 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): :return int: The number of removed records. 
""" - if not dataset and not ids and not field_ids: + if not dataset_key and not id and not field_id: return 0 where = {} if dataset_key: where["dataset"] = dataset_key - if ids: - where["id"] = ids - if field_ids: - where["field_id"] = field_ids + if id: + where["id"] = id + if field_id: + where["field_id"] = field_id return self.db.delete("annotations", where) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 0a650773e..e0248cfcf 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -122,7 +122,6 @@ annotation_fields = json.loads(dataset["annotation_fields"]) author = dataset.get("creator", "") - if not row.get("annotations"): print(" No annotations for dataset %s, skipping..." % row["dataset"]) skipped_count += 1 diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index 3114d4049..ea1ef48f5 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor +from common.lib.annotation import Annotation from backend.lib.processor import BasicProcessor __author__ = "Stijn Peeters" @@ -51,11 +52,17 @@ def process(self): first_interval = "9999" last_interval = "0000" + annotations = [] + self.dataset.update_status("Processing items") with self.dataset.get_results_path().open("w") as results: counter = 0 for post in self.source_dataset.iterate_items(self): + + annotation = Annotation(value="test", label="count_posts_test", dataset=self.source_dataset) + annotations.append(annotation) + try: date = get_interval_descriptor(post, timeframe) except ValueError as e: diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 062782967..35104d28b 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -2,7 +2,7 @@ $(document).ready(function(){ $(init); -/** +/* * Page init */ function init() { @@ -15,7 +15,7 @@ function init() { } -/** +/* * Handle annotations */ const annotations = { @@ -334,7 +334,7 @@ const annotations = { let edited = false let timestamp = Date.now() / 100 - if (annotation_type == "text" || annotation_type == "textarea") { + if (annotation_type === "text" || annotation_type === "textarea") { val = $(this).find(".post-annotation-input").val(); // It can be the case that the input text is deleted // In this case we *do* want to push new data, so we check @@ -343,11 +343,11 @@ const annotations = { edited = true } } - else if (annotation_type == "dropdown") { + else if (annotation_type === "dropdown") { let selected = $(this).find(".post-annotation-options").val(); val = selected; } - else if (annotation_type == "checkbox") { + else if (annotation_type === "checkbox") { val = []; $(this).find(".post-annotation-options > input").each(function(){ if ($(this).is(":checked")) { @@ -361,7 +361,7 @@ const annotations = { val = undefined; } } - if ((val != undefined && val != "") || edited) { + if ((val !== undefined && val !== "") || edited) { vals_changed = true; val = ""; } @@ -375,7 +375,7 @@ const annotations = { "timestamp_created": "", "label": label, "type": annotation_type, - "options": , + "options": "", "value": "", "author": "", "by_processor": "", diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 466735a0f..a3acb82d3 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -4,7 +4,6 @@ """ import json 
-import re from pathlib import Path @@ -13,6 +12,7 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet +from common.lib.annotation import Annotation from common.lib.helpers import convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -132,9 +132,7 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations inserted already. # If so, also pass these to the template. - annotations = db.fetchall("SELECT * FROM annotations WHERE key = %s", (key,)) - if annotations: - annotations = json.loads(annotations) + annotations = dataset.get_annotations() # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) @@ -247,7 +245,7 @@ def explorer_save_annotation_fields(key): @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotations(key): +def explorer_save_annotations(key=None): """ Save the annotations of a dataset to the annotations table. @@ -257,19 +255,20 @@ def explorer_save_annotations(key): :return int: The number of posts with annotations saved. """ - # Get dataset. + # Save it! + annotations = request.get_json() + + # Annotations are always associated with a dataset. + if not key and annotations: + key = annotations[0].get("dataset", "") if not key: return error(404, error="No dataset key provided") try: dataset = DataSet(key=key, db=db) except DataSetException: return error(404, error="Dataset not found.") - - # Save it! 
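# A sketch of the JSON payload the rewritten view above now expects from
# explorer.js: a flat list of annotation objects (hypothetical, trimmed
# values) rather than the old per-post dict:
#
#   [{"post_id": "123", "dataset": "abcd1234", "label": "sentiment",
#     "type": "text", "value": "positive"}]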
- new_annotations = request.get_json() - dataset.save_annotations(new_annotations) - - return "success" + + return dataset.save_annotations(annotations) def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): """ From 6ddae4e47eeb5b2857dea74b10791661bedea22b Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 8 Aug 2024 17:21:08 +0200 Subject: [PATCH 119/204] Fix mistakes in database.sql --- backend/database.sql | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/database.sql b/backend/database.sql index eafc9aaf8..c474eb773 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -71,6 +71,7 @@ CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, item_id TEXT, + field_id TEXT, dataset TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, @@ -84,21 +85,21 @@ CREATE TABLE IF NOT EXISTS annotations ( ); CREATE UNIQUE INDEX IF NOT EXISTS annotation_id - ON annotations_new ( + ON annotations ( id ); CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique - ON annotations_new ( + ON annotations ( label, dataset, - post_id + item_id ); CREATE INDEX IF NOT EXISTS annotation_value - ON annotations_new ( + ON annotations ( value ); CREATE INDEX IF NOT EXISTS annotation_timestamp - ON annotations_new ( + ON annotations ( timestamp ); From f679b7aad3ac8abcd0dcec2fb94d76bfa20821ad Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 8 Aug 2024 17:21:28 +0200 Subject: [PATCH 120/204] Make Annotation object usable --- common/lib/annotation.py | 239 ++++++++++++++++++++++----------------- common/lib/dataset.py | 36 ++++-- 2 files changed, 163 insertions(+), 112 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 771e28343..8cda9d0dd 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -2,13 +2,18 @@ Annotation class """ -from common.config_manager import config + +import time +import json + +from common.lib.exceptions import AnnotationException class Annotation: """ Annotation class - Annotations are always tied to a dataset and an item ID. + Annotations are always tied to a dataset, a dataset item (e.g. a csv row), + an annotation label, and a type ('text', 'multichoice', etc.). """ @@ -17,88 +22,166 @@ class Annotation: data = None db = None - id = "" # Unique ID for this annotation - parent_id = "" # ID of the data for this annotation, e.g. post ID - dataset = "" # Dataset key this annotation is generated from - timestamp = 0 # When this annotation was edited - timestamp_created = 0 # When this timestamp was created - label = "" # Label of annotation - options = [] # Possible options - value = "" # The actual annotation value - author = "" # Who made the annotation - by_processor = False # Whether the annotation was made by a processor - metadata = {} # Misc metadata - - def __init__(self, db, data, id=None, item_id=None, label=None, dataset_key=None): + id = None # Unique ID for this annotation + item_id = None # ID of the item for this annotation, e.g. 
post ID + field_id = None # If of this type of annotation field for this dataset + dataset = None # Dataset key this annotation is generated from + timestamp = None # When this annotation was edited + timestamp_created = None # When this timestamp was created + label = None # Label of annotation + options = None # Possible options + value = None # The actual annotation value + author = None # Who made the annotation + by_processor = None # Whether the annotation was made by a processor + metadata = None # Misc metadata + + def __init__(self, data=None, id=None, db=None): """ Instantiate annotation object. - :param db: Database connection object - :param dict data: Annotation data; should correspond to the annotations table records. - + :param data: Annotation data; should correspond to the annotations table record. + :param id: The ID of an annotation. If given, it retrieves the annotation + from the database. + :param db: Database connection object """ - self.db = db - self.data = data - self.item_id = item_id + required_fields = ["label", "item_id", "dataset"] - if id is not None: - self.id = id - current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.id,)) - if not current: - raise AnnotationException( - "Annotation() requires a valid ID for its 'id' argument, \"%s\" given" % id) + # Must have an ID or data + if id is None and (data is None or not isinstance(data, dict)): + raise AnnotationException("Annotation() requires either a `data` dictionary or ID.") - # Should be present for all annotation fields - mandatory_keys = ["post_id", "label", "value"] + if not db: + raise AnnotationException("Annotation() needs a `db` database object") + self.db = db - if dataset_key is not None and label is not None and dataset_key is not None: - current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.key,)) + current = None + new_or_updated = False + + # Get the annotation data if the ID is given; if an annotation has + # an ID, it is guaranteed to be in the database. + # IDs can both be explicitly given or present in the data dict. + if id is not None or "id" in data: + if "id" in data: + id = data["id"] + self.id = id # IDs correspond to unique serial numbers in the database. + current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) if not current: - raise DataSetNotFoundException( - "DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) - + raise AnnotationException( + "Annotation() requires a valid ID for its 'id' argument, %s given" % id) - def get_by_id(db, id): + # If an ID is not given, get or create an Annotation object from its data. + # First check if required fields are present in `data`. 
+ else: + for required_field in required_fields: + if required_field not in data or not data[required_field]: + raise AnnotationException("Annotation() requires a %s field" % required_field) + + # Check if this annotation already exists, based on the data + current = self.get_by_field(data["dataset"], data["item_id"], data["label"]) + + # If we were able to retrieve an annotation from the db, it already exists + if current: + # Check if we have to overwrite old data with new data + if data: + for key, value in data.items(): + # Save unknown fields in metadata + if key not in current: + current["metadata"][key] = value + new_or_updated = True + # Else update the value + elif current[key] != value: + current[key] = value + new_or_updated = True + + self.data = current + + # If this is a new annotation, set all the properties. + else: + # Keep track of when the annotation was made + created_timestamp = int(time.time()) + # Store unknown properties in `metadata` + metadata = {k: v for k, v in data.items() if k not in self.__dict__} + print(self.__dict__) + print(metadata) + new_data = { + "item_id": data["item_id"], + "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), + "dataset": data["dataset"], + "timestamp_created": timestamp, + "label": data["label"], + "type": data.get("type", "text"), + "options": data.get("options", ""), + "value": data.get("value", ""), + "author": data.get("author", ""), + "by_processor": data.get("by_processor", False), + "metadata": metadata + } + self.data = new_data + new_or_updated = True + + # Write to db if anything changed + if new_or_updated: + timestamp = int(time.time()) + self.timestamp = timestamp + self.write_to_db() + + def get_by_id(self, id): """ Get annotation by ID - :param db: Database connection object :param str name: ID of annotation :return: Annotation object, or `None` for invalid annotation ID """ - data = db.fetchone("SELECT * FROM annotations WHERE id = %s", (id,)) - if not annotation: - return None - else: - return Annotation.get_by_data(db, data) - def get_by_data(db, data): - """ - Instantiate annotation object with given data + try: + int(id) + except ValueError: + raise AnnotationException("Id '%s' is not valid" % id) - :param db: Database handler - :param dict data: Annotation data, should correspond to a database row - :return Annotation: Annotation object - """ - return Annotation(db, data) + return Annotation(id=id) - def set_id_by_data(self, item): + def get_by_field(self, dataset_key, item_id, label): """ - Creates an ID based on the data of the item it has annotated. + Get the annotation information via its dataset key, item ID, and label. + This is always a unique comibination. + :param dataset_key: The key of the dataset this annotation was made for. + :param item_id: The ID of the item this annotation was made for. + :param label: The label of the annotation. + :return data: A dict with data of the retrieved annotation, or None if it doesn't exist. """ + data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", + (dataset_key, item_id, label)) + if not data: + return None - return True + data["metadata"] = json.loads(data["metadata"]) + return data - def save(self): + def get_field_id(self, dataset_key, label): """ - Save an annotation to the database. + Sets a `field_id` based on the dataset key and label. + This combination should be unique. 
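The (dataset, item_id, label) triple that `get_by_field()` queries on is the combination enforced by the `annotation_unique` index from the database.sql patch above, so at most one row can match. As a usage sketch, assuming the parameterized `db.fetchone()` used throughout 4CAT and placeholder values:

    # Sketch: fetch one annotation by its unique (dataset, item_id, label) triple.
    # The dataset key, item ID and label below are placeholders.
    row = db.fetchone(
        "SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s",
        ("abcd1234efgh5678", "12345", "sentiment")
    )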
+ + :param dataset_key: The dataset key + :param label: The label of the dataset. """ - return True + field_id_base = "-".join([dataset_key, label]) + field_id = int.from_bytes(field_id_base.encode(), "little") + self.field_id = field_id + return field_id + + def write_to_db(self): + """ + Write an annotation to the database. + """ + data = self.data + data["metadata"] = json.dumps(data["metadata"]) + return self.db.upsert("annotations", data=data, constraints=["dataset", "label", "item_id"]) @staticmethod def save_many(self, annotations, overwrite=True): @@ -112,53 +195,8 @@ def save_many(self, annotations, overwrite=True): :returns int: How many annotations were saved. """ - - field_keys = {} - annotations_to_delete = set() - - # We're going to add the annotation metadata to the datasets table - # based on the annotations themselves. - annotation_fields = self.get_annotation_fields() - existing_annotations = self.get_annotations() - existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] - - timestamp = time.time() - new_annotations = [] for annotation in annotations: - - # Do some validation; dataset key, post_id, label, and value need to be present. - missing_keys = [] - for mandatory_key in mandatory_keys: - if mandatory_key not in annotation: - missing_keys.append(mandatory_key) - if missing_keys: - raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) - - # Add dataset key - annotation["dataset"] = self.key - - # Raise exception if this label is already present for this dataset - # and we're not overwriting - if not overwrite and annotation["label"] in existing_labels: - raise AnnotationException("Couldn't save annotations; label %s already present") - - # If there's no type given, use 'text' - if not annotation.get("type"): - annotation["type"] = "text" - - # If there's no timestamp given, set it to the current time. - if not "timestamp" in annotation: - annotation["timestamp"] = timestamp - annotation["timestamp_created"] = timestamp - - # If not already given, create an ID for this annotation - # based on the label, type, and dataset key. - if "field_id" not in annotation: - field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) - field_id = int.from_bytes(field_id_base.encode(), "little") - annotation["field_id"] = field_id - # Add annotation metadata if it is not saved to the datasets table yet. # This is just a simple dict with a field ID, type, label, and possible options. if annotation["field_id"] not in annotation_fields: @@ -214,7 +252,6 @@ def delete_many(self, dataset_key=None, id=None, field_id=None): return self.db.delete("annotations", where) - def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 92dcfc625..223c4cd77 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1605,6 +1605,20 @@ def has_annotation_fields(self): return True if annotation_fields else False + def make_annotations(self, annotations): + """ + Generates a list of annotation objects from annotation JSON. 
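For a sense of the field IDs minted by `get_field_id()` above: joining the dataset key and label and reading the bytes as a little-endian integer gives a deterministic but very long number, which patch 121 below swaps for a fixed-length MD5 hex digest. A sketch with placeholder inputs:

    # Sketch: the integer field ID produced by the "-".join + int.from_bytes scheme.
    field_id_base = "-".join(["abcd1234efgh5678", "sentiment"])  # placeholder key and label
    field_id = int.from_bytes(field_id_base.encode(), "little")  # deterministic ~200-bit integer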
+ :param annotations: A list of dicts or JSON string with annotations + """ + if not annotations: + return None + if isinstance(annotations, str): + annotaitons = json.loads(annotations) + + annotations = [Annotation(annotation, self) for annotation in annotations] + + return annotations + def get_annotation_fields(self): """ Retrieves the saved annotation fields for this dataset. @@ -1822,17 +1836,17 @@ def save_annotations(self, annotations, overwrite=True): if not annotations: return 0 - # Add dataset info to annotations - key = self.key - owner = self.get_owners()[0] - if "dataset" - for i in range(len(annotations)): - if not annotations[i].get("dataset"): - annotations[i]["dataset"] = key - if not annotations[i].get("author"): - annotations[i]["author"] = owner - - return Annotation.save_many(annotations, overwrite=overwrite) + # Add some dataset data to annotations, if not present + for annotation in annotations: + # Set dataset key + if not annotation.get("dataset"): + annotation["dataset"] = self.key + # Set default author to this dataset owner + if not annotation.get("author"): + annotation["author"] = self.get_owners()[0] + + # Create Annotation object, which saves it to the database + Annotation(data=annotation, db=self.db) def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ From 24425e157aebbc1561bd1e091d01c7167e293e15 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 9 Aug 2024 17:55:01 +0200 Subject: [PATCH 121/204] General annotations improvements and make processors save annotations --- backend/database.sql | 4 +- backend/lib/processor.py | 55 +++ common/lib/annotation.py | 227 ++++++++--- common/lib/database.py | 2 +- common/lib/dataset.py | 412 +++++++++----------- helper-scripts/migrate/migrate-1.45-1.46.py | 10 +- processors/metrics/count_posts.py | 4 +- 7 files changed, 402 insertions(+), 312 deletions(-) diff --git a/backend/database.sql b/backend/database.sql index c474eb773..01e124eaa 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -70,9 +70,9 @@ CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ -- annotations CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, - item_id TEXT, - field_id TEXT, dataset TEXT, + field_id TEXT, + item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, label TEXT, diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 24b7b4a11..5a0aaff52 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -712,6 +712,59 @@ def create_standalone(self): return standalone + def write_annotations(self, annotations: list, source_dataset=None, overwrite=False) -> int: + """ + Saves annotations made by this processor on the basis of another dataset. + Also adds some data regarding this processor: set `author` and `label` to processor name, + and add parameters to `metadata` (unless explicitly indicated). + + :param annotations: List of dictionaries with annotation items. Must have `item_id` and `value`. + E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] + :param source_dataset: The dataset that these annotations were based on. + Defaults to the parent dataset. + :param bool overwrite: Whether to overwrite annotations if the label is already present + for the dataset. If this is False and the label is already present, + we'll add a number to the label to differentiate it (e.g. `count-posts1`). + Else we'll just replace the old data. + + :returns int: How many annotations were saved. 
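The count_posts.py hunk at the end of this patch shows the intended calling pattern for this hook: a processor collects one dict per item and hands the list over. Roughly, inside a processor's process() method:

    # Sketch: annotate every item of the source dataset; "example" is a placeholder value.
    # Label and author fall back to the processor's name, per the code below.
    annotations = []
    for item in self.source_dataset.iterate_items(self):
        annotations.append({"item_id": item["id"], "value": "example"})
    self.write_annotations(annotations)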
+ + """ + + if not annotations: + return 0 + + # Default to parent dataset + if not source_dataset: + source_dataset = self.source_dataset + + # Check if this dataset already has annotation fields + existing_labels = source_dataset.get_annotation_field_labels() + + # Set some values + for annotation in annotations: + + # Set the default author and label to this processor's name + if not annotation.get("label"): + # If the processor has already generated annotation fields, + # add a number to differentiate the label + label = self.name + if not overwrite and label in existing_labels: + label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) + annotation["label"] = label + if not annotation.get("author"): + annotation["author"] = self.name + + annotation["by_processor"] = True + + # Add processor parameters to annotation metadata + if not annotation.get("metadata"): + annotation["metadata"] = {} + annotation["metadata"]["processor-parameters"] = self.parameters + + annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) + return annotations_saved + @classmethod def map_item_method_available(cls, dataset): """ @@ -847,6 +900,8 @@ def get_extension(self, parent_dataset=None): # A non filter processor updated the base Processor extension to None/False? return None + + @classmethod def is_rankable(cls, multiple_items=True): """ diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 8cda9d0dd..b871006a9 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -5,6 +5,7 @@ import time import json +import hashlib from common.lib.exceptions import AnnotationException @@ -49,7 +50,7 @@ def __init__(self, data=None, id=None, db=None): # Must have an ID or data if id is None and (data is None or not isinstance(data, dict)): - raise AnnotationException("Annotation() requires either a `data` dictionary or ID.") + raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.") if not db: raise AnnotationException("Annotation() needs a `db` database object") @@ -69,7 +70,7 @@ def __init__(self, data=None, id=None, db=None): current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) if not current: raise AnnotationException( - "Annotation() requires a valid ID for its 'id' argument, %s given" % id) + "Annotation() requires a valid ID for an existing annotation, %s given" % id) # If an ID is not given, get or create an Annotation object from its data. # First check if required fields are present in `data`. @@ -78,7 +79,7 @@ def __init__(self, data=None, id=None, db=None): if required_field not in data or not data[required_field]: raise AnnotationException("Annotation() requires a %s field" % required_field) - # Check if this annotation already exists, based on the data + # Check if this annotation already exists, based on dataset key, item id, and label. current = self.get_by_field(data["dataset"], data["item_id"], data["label"]) # If we were able to retrieve an annotation from the db, it already exists @@ -90,7 +91,7 @@ def __init__(self, data=None, id=None, db=None): if key not in current: current["metadata"][key] = value new_or_updated = True - # Else update the value + # If values differ, update the value elif current[key] != value: current[key] = value new_or_updated = True @@ -99,39 +100,39 @@ def __init__(self, data=None, id=None, db=None): # If this is a new annotation, set all the properties. 
else: + # Keep track of when the annotation was made created_timestamp = int(time.time()) - # Store unknown properties in `metadata` - metadata = {k: v for k, v in data.items() if k not in self.__dict__} - print(self.__dict__) - print(metadata) + new_data = { "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), "dataset": data["dataset"], - "timestamp_created": timestamp, + "timestamp": 0, + "timestamp_created": created_timestamp, "label": data["label"], "type": data.get("type", "text"), "options": data.get("options", ""), "value": data.get("value", ""), "author": data.get("author", ""), "by_processor": data.get("by_processor", False), - "metadata": metadata + "metadata": data.get("metadata", {}), } + self.data = new_data new_or_updated = True # Write to db if anything changed if new_or_updated: - timestamp = int(time.time()) - self.timestamp = timestamp + self.data["timestamp"] = int(time.time()) + print(self.data) self.write_to_db() - def get_by_id(self, id): + def get_by_id(self, id: int): """ Get annotation by ID - :param str name: ID of annotation + :param str id: ID of annotation :return: Annotation object, or `None` for invalid annotation ID """ @@ -142,10 +143,10 @@ def get_by_id(self, id): return Annotation(id=id) - def get_by_field(self, dataset_key, item_id, label): + def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ Get the annotation information via its dataset key, item ID, and label. - This is always a unique comibination. + This is always a unique combination. :param dataset_key: The key of the dataset this annotation was made for. :param item_id: The ID of the item this annotation was made for. @@ -157,12 +158,12 @@ def get_by_field(self, dataset_key, item_id, label): data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", (dataset_key, item_id, label)) if not data: - return None + return {} data["metadata"] = json.loads(data["metadata"]) return data - def get_field_id(self, dataset_key, label): + def get_field_id(self, dataset_key: str, label: str) -> str: """ Sets a `field_id` based on the dataset key and label. This combination should be unique. @@ -170,8 +171,7 @@ def get_field_id(self, dataset_key, label): :param dataset_key: The dataset key :param label: The label of the dataset. """ - field_id_base = "-".join([dataset_key, label]) - field_id = int.from_bytes(field_id_base.encode(), "little") + field_id = hashlib.md5(dataset_key + label.encode("utf-8")).hexdigest() self.field_id = field_id return field_id @@ -179,48 +179,10 @@ def write_to_db(self): """ Write an annotation to the database. """ - data = self.data - data["metadata"] = json.dumps(data["metadata"]) - return self.db.upsert("annotations", data=data, constraints=["dataset", "label", "item_id"]) - - @staticmethod - def save_many(self, annotations, overwrite=True): - """ - Takes a list of annotations and saves them to the annotations table. - If a field is not yet present in the datasets table, it also adds it there. - - :param bool overwrite: Whether to overwrite annotation if the label is already present - for the dataset. - - :returns int: How many annotations were saved. - - """ - new_annotations = [] - for annotation in annotations: - # Add annotation metadata if it is not saved to the datasets table yet. - # This is just a simple dict with a field ID, type, label, and possible options. 
- if annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation["type"] - } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] - - new_annotations.append(annotation) - - # Save annotation fields if they're not present yet. - if annotation_fields != self.get_annotation_fields(): - self.save_annotation_fields(annotation_fields) - - # If there's nothing to save or delete, do nothing - if not new_annotations: - return 0 - - # Overwrite old annotations with upsert. Else add. - self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) - - return len(new_annotations) + db_data = self.data + m = db_data["metadata"] # To avoid circular reference error + db_data["metadata"] = json.dumps(m) + return self.db.upsert("annotations", data=db_data, constraints=["label", "dataset", "item_id"]) def delete(self): """ @@ -229,10 +191,11 @@ def delete(self): return self.db.delete("annotations", {"id": self.id}) @staticmethod - def delete_many(self, dataset_key=None, id=None, field_id=None): + def delete_many(db, dataset_key=None, id=None, field_id=None): """ Deletes annotations for an entire dataset or by a list of (field) IDs. + :param db: Database object. :param str dataset_key: A dataset key. :param li id: A list or string of unique annotation IDs. :param li field_id: A list or string of IDs for annotation fields. @@ -250,7 +213,143 @@ def delete_many(self, dataset_key=None, id=None, field_id=None): if field_id: where["field_id"] = field_id - return self.db.delete("annotations", where) + return db.delete("annotations", where) + + @staticmethod + def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dict, db) -> int: + """ + Updates annotations in the annotations table if the input fields + themselves have been changed, for instance if a dropdown label is renamed + or a field is deleted. + + :param str dataset_key: The dataset key for which fields changed. + :param dict old_fields: Old annotation fields. + :param dict new_fields: New annotation fields; this should contain not just + the additions, but all fields, changed or otherwise. + :param db: Database object so we can write. + + :returns int: How many records were affected. + """ + + text_fields = ["textarea", "text"] + + # If old and new fields are identical, do nothing. + if old_fields == new_fields: + return 0 + + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID + + # Loop through the old annotation fields + for old_field_id, old_field in old_fields.items(): + + # Delete all annotations of this type if the field is deleted. + if old_field_id not in new_fields: + fields_to_delete.add(old_field_id) + continue + + field_id = old_field_id + new_field = new_fields[field_id] + + # If the annotation type has changed, also delete existing annotations, + # except between text and textarea, where we can just change the type and keep the text. + if old_field["type"] != new_field["type"]: + if not old_field["type"] in text_fields and not new_field["type"] in text_fields: + fields_to_delete.add(field_id) + continue + + # Loop through all the key/values in the new field settings + # and update in case it's different from the old values. 
+ update_data = {} + for field_key, field_value in new_field.items(): + + # Update if values don't match + if field_value != old_field.get(field_key): + + # Special case: option values that are removed/renamed. + # Here we may have to change/delete values within the + # values column. + if field_key == "options": + + new_options = field_value + + # Edge case: delete annotations of this type if all option fields are deleted + if not new_options: + fields_to_delete.add(field_id) + continue + + old_options = old_field["options"] + options_to_update = {} + + # Options are saved in a dict with IDs as keys and labels as values. + for old_option_id, old_option in old_options.items(): + # Renamed option label + if old_option_id in new_options and old_option != new_options[old_option_id]: + options_to_update[old_option] = new_options[old_option_id] # Old label -> new label + # Deleted option + elif old_option_id not in new_options: + options_to_update[old_option] = None # Remove None labels later + + if options_to_update: + update_data[field_key] = {"options": options_to_update} + + # For all other changes, just overwrite with new data. + else: + update_data[field_key] = field_value + + if update_data: + fields_to_update[field_id] = update_data + + # Delete annotations + if fields_to_delete: + Annotation.delete_many(db, field_id=list(fields_to_delete)) + + # Write changes to fields to database + count = 0 + if fields_to_update: + for field_id, updates in fields_to_update.items(): + + # Write to db + for column, update_value in updates.items(): + + # Change values of columns + updates = db.update("annotations", {column: update_value}, + where={"dataset": dataset_key, "field_id": field_id}) + count += updates + + # Special case: Changed options. + # Here we have to also rename/remove inserted options from the values column. + if column == "options": + + inserted_options = db.fetchall("SELECT id, value FROM annotations " + "WHERE dataset = %s and field_id = %s" % (dataset_key, field_id)) + new_inserts = [] + for inserted_option in inserted_options: + + annotation_id = inserted_option["id"] + inserted_option = inserted_option["value"] + + if not inserted_option: + continue + + # Remove or rename options + new_values = [] + for inserted_option in inserted_options: + if inserted_option in update_value: + if update_value[inserted_option] == None: + # Don't add + continue + elif inserted_option in update_value: + # Replace with new value + new_values.append(update_value[inserted_option]) + else: + # Keep old value + new_values.append(inserted_option) + + new_values = ",".join(new_values) + db.update("annotations", {"value": new_values}, where={"id": annotation_id}) + + return count def __getattr__(self, attr): """ diff --git a/common/lib/database.py b/common/lib/database.py index 9166dab4f..eb69a0d2f 100644 --- a/common/lib/database.py +++ b/common/lib/database.py @@ -105,8 +105,8 @@ def update(self, table, data, where=None, commit=True): Update a database record :param string table: Table to update - :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param dict data: Data to set, Column => Value + :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param bool commit: Whether to commit after executing the query :return int: Number of affected rows. 
Note that this may be unreliable if `commit` is `False` diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 223c4cd77..c377f37b5 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -334,14 +334,10 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau if own_processor and own_processor.map_item_method_available(dataset=self): item_mapper = True - # Annotations and annotation fields are dynamically added to top-level dataset - # and we're handling as 'extra' map_item fields. - annotation_fields = None - annotations = None - if self.is_top_dataset(): - annotation_fields = self.get_annotation_fields() - if annotation_fields: - annotations = self.get_annotations() + # Annotations are dynamically added + # and we're handling them as 'extra' map_item fields. + has_annotations = self.has_annotations() + annotation_labels = self.get_annotation_field_labels() # missing field strategy can be for all fields at once, or per field # if it is per field, it is a dictionary with field names and their strategy @@ -387,27 +383,30 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau mapped_item = original_item # Add possible annotations - if annotation_fields: - for annotation_field in annotation_fields.values(): + if has_annotations: + + # Get the annotation, if available. + post_annotations = self.get_annotations(item_ids=[mapped_item.data["id"]]) + + # We're always handling annotated data as a MappedItem object, + # even if no map_item() function is available for the data source. + if not isinstance(mapped_item, MappedItem): + mapped_item = MappedItem(mapped_item) + + for annotation_label in annotation_labels: annotation = "" - annotation_label = annotation_field["label"] - - # Get the annotation, if available. - # We're always handling annotated data as a MappedItem object, - # even if no map_item() function is available for the data source. - if not isinstance(mapped_item, MappedItem): - mapped_item = MappedItem(mapped_item) - - if annotations and mapped_item.data.get("id", "") in annotations: - annotation = annotations[mapped_item.data["id"]].get(annotation_label, "") - if isinstance(annotation, list): - annotation = ",".join(annotation) + + for post_annotation in post_annotations: + if post_annotation.label == annotation_label: + annotation = post_annotation.value + if isinstance(annotation, list): + annotation = ",".join(annotation) # We're always adding an annotation value, # as an empty string if it's absent. 
mapped_item.data[annotation_label] = annotation - + # yield a DatasetItem, which is a dict with some special properties yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item)) @@ -658,7 +657,7 @@ def get_owners_users(self, role="owner"): # owners that are owner by being part of a tag owners.extend(itertools.chain(*[tagged_owners for tag, tagged_owners in self.tagged_owners.items() if - role is None or self.owners[f"tag:{tag}"]["role"] == role])) + role is None or self.owners[f"tag:{tag}"]["role"] == role])) # de-duplicate before returning return set(owners) @@ -1519,7 +1518,7 @@ def file_exists(self): if self.get_results_path().exists(): return True - + return False def get_extension(self): @@ -1545,7 +1544,7 @@ def get_result_url(self): """ filename = self.get_results_path().name url_to_file = ('https://' if config.get("flask.https") else 'http://') + \ - config.get("flask.server_name") + '/result/' + filename + config.get("flask.server_name") + '/result/' + filename return url_to_file def warn_unmappable_item(self, item_count, processor=None, error_message=None, warn_admins=True): @@ -1572,31 +1571,40 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w # No other log available raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn") - # Annotation features - def get_annotations(self): + # Annotation functions (most of it is handled in Annotations) + def has_annotations(self) -> bool: + """ + Whether this dataset has annotations + """ + + annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + return True if annotation else False + + def get_annotations(self, item_ids=[]) -> list: """ Retrieves the annotations for this dataset. + + :param item_ids: A list of item IDs to get the annotations from. + If empty, get all the annotations for this dataset. + return list: All annotations, each in their own dictionary. """ - annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + if item_ids: + annotations = self.db.fetchall("SELECT * FROM annotations " + "WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_ids))) + else: + annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) if not annotations: - annotations = None + annotations = [] else: - annotations = [Annotation(data=annotation, dataset=self) for annotation in annotations] + annotations = [Annotation(data=annotation, db=self.db) for annotation in annotations] return annotations - def has_annotations(self): - """ - Returns True if there's one or more annotations found - """ - - annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) - - return True if annotation else False - - def has_annotation_fields(self): + def has_annotation_fields(self) -> bool: """ Returns True if there's annotation fields saved tot the dataset table """ @@ -1605,21 +1613,7 @@ def has_annotation_fields(self): return True if annotation_fields else False - def make_annotations(self, annotations): - """ - Generates a list of annotation objects from annotation JSON. 
- :param annotations: A list of dicts or JSON string with annotations - """ - if not annotations: - return None - if isinstance(annotations, str): - annotaitons = json.loads(annotations) - - annotations = [Annotation(annotation, self) for annotation in annotations] - - return annotations - - def get_annotation_fields(self): + def get_annotation_fields(self) -> dict: """ Retrieves the saved annotation fields for this dataset. These are stored in the annotations table. @@ -1632,200 +1626,35 @@ def get_annotation_fields(self): if annotation_fields and annotation_fields.get("annotation_fields"): annotation_fields = json.loads(annotation_fields["annotation_fields"]) else: - annotation_fields = None + annotation_fields = {} return annotation_fields - def save_annotation_fields(self, new_fields, add=False): + def get_annotation_field_labels(self) -> list: """ - Save annotation field data to the datasets table (in the `annotation_fields` column). - If changes to the annotation fields affect existing annotations, - this function will also call `update_annotations_via_fields()` to change them. - - :param dict new_fields: New annotation fields, with a field ID as key. - - :param bool add: Whether we're merely adding new fields - or replacing the whole batch. If add is false, - `new_fields` should contain all fields. - - :return int: The number of annotation fields saved. - - """ - - # Get existing annotation fields to see if stuff changed. - old_fields = self.get_annotation_fields() - changes = False - - # Do some validation - # Annotation field must be valid JSON. - try: - json.dumps(new_fields) - except ValueError: - raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) - - # Annotation fields must at minimum have `type` and `label` keys. - for field_id, annotation_field in new_fields.items(): - if not isinstance(field_id, str): - raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) - if not "label" in annotation_field: - raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) - if not "type" in annotation_field: - raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) - - # Keep track of whether existing fields have changed; if so, we're going to - # update the annotations table. - if field_id in old_fields: - if old_fields[field_id] != annotation_field: - changes = True - - # If we're just adding fields, add them to the old fields - # If the field already exists, overwrite the old field. - if add and old_fields: - all_fields = old_fields - for field_id, annotation_field in new_fields.items(): - all_fields[field_id] = annotation_field - new_fields = all_fields - - # We're saving the new annotation fields as-is. - # Ordering of fields is preserved this way. - self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) - - # If we're adding but the field already exists, update/delete annotations with that ID. - add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) - - if changes or add_and_overlap: - self.update_annotations_via_fields(old_fields, new_fields) - - return len(new_fields) + Retrieves the saved annotation field labels for this dataset. + These are stored in the annotations table. - def update_annotations_via_fields(self, old_fields, new_fields): + :return list: List of annotation field labels. 
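Since fields are stored as a dict of field ID to settings, the label extraction below is a single comprehension. For example, with hypothetical stored fields shaped the way `save_annotations()` builds them later in this patch:

    # Hypothetical contents of the datasets table's annotation_fields column.
    annotation_fields = {
        "5f2b6e...": {"label": "sentiment", "type": "dropdown", "options": {"o1": "positive", "o2": "negative"}},
        "9c81aa...": {"label": "notes", "type": "textarea"}
    }
    labels = [v["label"] for v in annotation_fields.values()]  # ["sentiment", "notes"]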
""" - Updates annotations in the annotations table if the input field - itself has been changed, for instance if a dropdown label is renamed. - - :param di old_fields: Old annotation fields - :param di new_fields: New annotation fields; this should contain not just - additions, but all fields, changed or otherwise. - - """ - - text_fields = ["textarea", "text"] - - # If old and new fields are identical, do nothing. - if old_fields == new_fields: - return - - # Only update annotations if they, in fact, exist. - annotations = self.get_annotations() - if not annotations: - return - - fields_to_delete = set() # Delete all annotations with this field ID - fields_to_update = {} # Update values of annotations with this field ID - # Loop through the old annotation fields - for old_field_id, old_field in old_fields.items(): + annotation_fields = self.get_annotation_fields() - # Delete all annotations of this type if the field is deleted. - if old_field_id not in new_fields: - fields_to_delete.add(old_field_id) - continue + if not annotation_fields: + return [] - field_id = old_field_id - new_field = new_fields[field_id] + labels = [v["label"] for v in annotation_fields.values()] - # If the annotation type has changed, also delete existing annotations, - # except between text and textarea, where we can just change the type and keep the text. - if old_field["type"] != new_field["type"]: - if not old_field["type"] in text_fields and not new_field["type"] in text_fields: - fields_to_delete.add(field_id) - continue + return labels - # Loop through all the key/values in the new field data - # and update in case it's different from the old values. - update_data = {} - for field_key, field_value in new_field.items(): - - # Update if values don't match - if field_value != old_field.get(field_key): - - # Special case: option values that are removed/renamed. - # Here we only have to change specific values within the - # values column. - if field_key == "options": - - new_options = field_value - # Delete annotations of this type if all option fields are deleted - # (even though this should not be possible in the Explorer front-end) - if not new_options: - fields_to_delete.add(field_id) - continue - - old_options = old_field["options"] - - options_to_update = {} - - # Options are saved in a dict with IDs and labels as keys/values. - for old_option_id, old_option in old_options.items(): - # Renamed option label - if old_option_id in new_options and old_option != new_options[old_option_id]: - options_to_update[old_option] = new_options[old_option_id] # Old label -> new label - # Deleted option - elif old_option_id not in new_options: - options_to_update[old_option] = None # Remove None labels - - if options_to_update: - update_data[field_key] = {} - update_data[field_key]["options"] = options_to_update - - # For all other changes, just overwrite with new data. 
- else: - update_data[field_key] = field_value - - if update_data: - fields_to_update[field_id] = update_data - - # Delete annotations - if fields_to_delete: - self.delete_annotations(field_id=list(fields_to_delete)) - - # Change annotations based on changes in update fields - if fields_to_update: - new_annotations = [] - for annotation in annotations: - if annotation.field_id in fields_to_update: - for k, update_field in fields_to_update[annotation["field_id"]]: - - # Special case: Changed options - if k == "options": - new_values = [] - for inserted_option in annotations["value"].split(","): - if inserted_option in update_field: - if update_field[inserted_option] == None: - # Don't add - continue - elif inserted_option in update_field: - # Replace with new value - new_values.append(update_field[inserted_option]) - else: - # Keep old value - new_values.append(inserted_option) - - update_field = new_values - - annotation[k] = update_field - - new_annotations.append(annotation) - - # Save updated annotations - self.save_annotations(new_annotations) - - def save_annotations(self, annotations, overwrite=True): + def save_annotations(self, annotations: list, overwrite=True) -> int: """ Takes a list of annotations and saves them to the annotations table. - If a field is not yet present in the datasets table, it also adds it there. + If a field is not yet present in the `annotation_fields` column in + the datasets table, it also adds it there. - :param list annotations: List of dictionaries with annotation items. + :param list annotations: List of dictionaries with annotation items. Must have `item_id` and `label`. + E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] :param bool overwrite: Whether to overwrite annotation if the label is already present for the dataset. 
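In other words, the minimal input is the docstring example above; a sketch of saving ad-hoc annotations on a dataset object:

    # Sketch: two annotations for the same "Valid" field on one dataset.
    dataset.save_annotations([
        {"item_id": "12345", "label": "Valid", "value": "Yes"},
        {"item_id": "12346", "label": "Valid", "value": "No"}
    ])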
@@ -1836,17 +1665,63 @@ def save_annotations(self, annotations, overwrite=True): if not annotations: return 0 + count = 0 + annotation_fields = self.get_annotation_fields() + annotation_labels = self.get_annotation_field_labels() + known_field_ids = {} # Just so we don't have to hash every annotation without a field ID + # Add some dataset data to annotations, if not present for annotation in annotations: + + # Check if the required fields are present + if "item_id" not in annotation: + raise AnnotationException("Can't save annotations; annotation must have an `item_id` referencing " + "the item they annotated, got %s" % annotation) + if "label" not in annotation or not isinstance(annotation["label"], str): + raise AnnotationException("Can't save annotations; annotation must have a `label` field, " + "got %s" % annotation) + if not overwrite and annotation["label"] in annotation_labels: + raise AnnotationException("Can't save annotations; annotation field with label %s " + "already exists" % annotation["label"]) + # Set dataset key if not annotation.get("dataset"): annotation["dataset"] = self.key + + # If not present, add an ID for this annotation field, based on the dataset key and label + if "field_id" not in annotation: + field_id_str = annotation["label"] + annotation["dataset"] + # Check if we hashed this before + if field_id_str in known_field_ids: + field_id = known_field_ids[field_id_str] + else: + field_id = hashlib.md5(field_id_str.encode("utf-8")).hexdigest() + annotation["field_id"] = field_id + # Set default author to this dataset owner + # If this annotation is made by a processor, it will have the processor name if not annotation.get("author"): annotation["author"] = self.get_owners()[0] - # Create Annotation object, which saves it to the database + # Add data on the type of annotation field, if it is not saved to the datasets table yet. + # For now this is just a simple dict with a field ID, type, label, and possible options. + if not annotation_fields or annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation.get("type", "text") # Default to text + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + # Create Annotation object, which also saves it to the database Annotation(data=annotation, db=self.db) + count += 1 + + # Save annotation fields if things changed + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + return count def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ @@ -1857,7 +1732,7 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): :param li field_id: A list or string of IDs for annotation fields. :return int: The number of removed records. - """ + """ if not dataset_key and not id and not field_id: return 0 @@ -1872,6 +1747,67 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): return self.db.delete("annotations", where) + def save_annotation_fields(self, new_fields: dict, add=False) -> int: + """ + Save annotation field data to the datasets table (in the `annotation_fields` column). + If changes to the annotation fields affect existing annotations, + this function will also call `update_annotations_via_fields()` to change them. + + :param dict new_fields: New annotation fields, with a field ID as key. 
+ + :param bool add: Whether we're merely adding new fields + or replacing the whole batch. If add is False, + `new_fields` should contain all fields. + + :return int: The number of annotation fields saved. + + """ + + # Get existing annotation fields to see if stuff changed. + old_fields = self.get_annotation_fields() + changes = False + + # Do some validation + # Annotation field must be valid JSON. + try: + json.dumps(new_fields) + except ValueError: + raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) + + # Annotation fields must at minimum have `type` and `label` keys. + for field_id, annotation_field in new_fields.items(): + if not isinstance(field_id, str): + raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) + if "label" not in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) + if "type" not in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) + + # Keep track of whether existing fields have changed; if so, we're going to + # update the annotations table. + if field_id in old_fields: + if old_fields[field_id] != annotation_field: + changes = True + + # If we're just adding fields, add them to the old fields. + # If the field already exists, overwrite the old field. + if add and old_fields: + all_fields = old_fields + for field_id, annotation_field in new_fields.items(): + all_fields[field_id] = annotation_field + new_fields = all_fields + + # We're saving the new annotation fields as-is. + # Ordering of fields is preserved this way. + self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) + + # If anything changed with the annotation fields, possibly update + # existing annotations (e.g. to delete them or change their labels). 
+ if changes: + Annotation.update_annotations_via_fields(self.key, old_fields, new_fields, self.db) + + return len(new_fields) + def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index e0248cfcf..c127de266 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -37,8 +37,8 @@ for field_id, annotation_field in annotation_fields.items(): if "options" in annotation_field: - flattened_options = {} + if isinstance(annotation_field["options"], list): for op in annotation_field["options"]: flattened_options.update(op) @@ -55,9 +55,9 @@ db.execute(""" CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, - field_id SERIAL, - post_id TEXT, dataset TEXT, + field_id SERIAL, + item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, label TEXT, @@ -80,7 +80,7 @@ ON annotations_new ( label, dataset, - post_id + item_id ); CREATE INDEX IF NOT EXISTS annotation_value ON annotations_new ( @@ -104,7 +104,7 @@ count = 0 skipped_count = 0 - columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index ea1ef48f5..af32ed565 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -3,7 +3,6 @@ """ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor -from common.lib.annotation import Annotation from backend.lib.processor import BasicProcessor __author__ = "Stijn Peeters" @@ -60,7 +59,7 @@ def process(self): for post in self.source_dataset.iterate_items(self): - annotation = Annotation(value="test", label="count_posts_test", dataset=self.source_dataset) + annotation = {"value": "test", "item_id": post["id"]} annotations.append(annotation) try: @@ -153,6 +152,7 @@ def process(self): row["value_relative"] = intervals[interval]["relative"] rows.append(row) + self.write_annotations(annotations) self.write_csv_items_and_finish(rows) @classmethod From 77213410f2a3ce130ceca3d0923bc7137159c5e2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 13:08:57 +0200 Subject: [PATCH 122/204] fix: Bug in migrate --- helper-scripts/migrate/migrate-1.45-1.46.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index c127de266..5e8f0d979 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -110,7 +110,6 @@ for row in annotations: dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["dataset"] + "';") - # If the dataset is not present anymore, # we're going to skip these annotations; # likely the dataset is expired. 
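As an aside on the dataset lookup a few lines up: queries elsewhere in this series pass values as parameters instead of concatenating them into the SQL string. The equivalent parameterized form would be:

    # Parameterized variant of the lookup above.
    dataset = db.fetchone("SELECT * FROM datasets WHERE key = %s;", (row["dataset"],))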
@@ -119,7 +118,11 @@ skipped_count += 1 continue - annotation_fields = json.loads(dataset["annotation_fields"]) + annotation_fields = dataset["annotation_fields"] + if annotation_fields: + annotation_fields = json.loads(dataset.get("annotation_fields")) + else: annotation_fields = {} + author = dataset.get("creator", "") if not row.get("annotations"): From 22f6ea2fbea33aa1eeb7c76564d2a4290791fd4d Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 15:45:03 +0200 Subject: [PATCH 123/204] Fixes in migrate script --- helper-scripts/migrate/migrate-1.45-1.46.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 5e8f0d979..d45523716 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -56,7 +56,7 @@ CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, dataset TEXT, - field_id SERIAL, + field_id TEXT, item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, @@ -104,7 +104,7 @@ count = 0 skipped_count = 0 - columns = "dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: @@ -156,9 +156,9 @@ value = ",".join(value) inserts = [( - str(post_id), # post_id; needs to be a string, changes per data source. + row["dataset"], # dataset int(field_id), # field_id; this is an ID for the same type of input field. - row["dataset"], # dataset + str(post_id), # post_id; needs to be a string, changes per data source. dataset["timestamp"], # timestamp dataset["timestamp"], # timestamp_created label, # label From e5cce4905894557ed018549aa0d64f0b56022909 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 15:45:23 +0200 Subject: [PATCH 124/204] Improve Annotation() and make map_item() fetch annotation values --- common/lib/annotation.py | 23 +++++++++++-------- common/lib/dataset.py | 49 ++++++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index b871006a9..7da8a01a8 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -23,6 +23,7 @@ class Annotation: data = None db = None + id = None # Unique ID for this annotation item_id = None # ID of the item for this annotation, e.g. post ID field_id = None # If of this type of annotation field for this dataset @@ -30,6 +31,7 @@ class Annotation: timestamp = None # When this annotation was edited timestamp_created = None # When this timestamp was created label = None # Label of annotation + type = None # Type of annotation (e.g. `text`) options = None # Possible options value = None # The actual annotation value author = None # Who made the annotation @@ -64,7 +66,7 @@ def __init__(self, data=None, id=None, db=None): # an ID, it is guaranteed to be in the database. # IDs can both be explicitly given or present in the data dict. if id is not None or "id" in data: - if "id" in data: + if data and "id" in data: id = data["id"] self.id = id # IDs correspond to unique serial numbers in the database. 
current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) @@ -105,9 +107,9 @@ def __init__(self, data=None, id=None, db=None): created_timestamp = int(time.time()) new_data = { + "dataset": data["dataset"], "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), - "dataset": data["dataset"], "timestamp": 0, "timestamp_created": created_timestamp, "label": data["label"], @@ -122,17 +124,20 @@ def __init__(self, data=None, id=None, db=None): self.data = new_data new_or_updated = True + for k, v in self.data.items(): + self.__setattr__(k, v) + # Write to db if anything changed if new_or_updated: - self.data["timestamp"] = int(time.time()) - print(self.data) + self.timestamp = int(time.time()) self.write_to_db() - def get_by_id(self, id: int): + def get_by_id(id: int, db): """ Get annotation by ID :param str id: ID of annotation + :param db: Database connection object :return: Annotation object, or `None` for invalid annotation ID """ @@ -141,7 +146,7 @@ def get_by_id(self, id: int): except ValueError: raise AnnotationException("Id '%s' is not valid" % id) - return Annotation(id=id) + return Annotation(id=id, db=db) def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ @@ -386,9 +391,9 @@ def __setattr__(self, attr, value): return if attr not in self.data: - self.parameters[attr] = value + self.metadata[attr] = value attr = "metadata" - value = self.parameters + value = self.metadata if attr == "metadata": value = json.dumps(value) @@ -398,4 +403,4 @@ def __setattr__(self, attr, value): self.data[attr] = value if attr == "metadata": - self.parameters = json.loads(value) \ No newline at end of file + self.metadata = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index c377f37b5..364fa167b 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -385,8 +385,8 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau # Add possible annotations if has_annotations: - # Get the annotation, if available. - post_annotations = self.get_annotations(item_ids=[mapped_item.data["id"]]) + # Get annotations for this specific post + post_annotations = self.get_annotations(item_id=mapped_item.data["id"]) # We're always handling annotated data as a MappedItem object, # even if no map_item() function is available for the data source. @@ -394,18 +394,16 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau mapped_item = MappedItem(mapped_item) for annotation_label in annotation_labels: - - annotation = "" - + value = "" for post_annotation in post_annotations: if post_annotation.label == annotation_label: - annotation = post_annotation.value - if isinstance(annotation, list): - annotation = ",".join(annotation) + value = post_annotation.value + if isinstance(value, list): + value = ",".join(value) - # We're always adding an annotation value, - # as an empty string if it's absent. - mapped_item.data[annotation_label] = annotation + # We're always adding an annotation value + # as an empty string, even if it's absent. 
+ mapped_item.data[annotation_label] = value # yield a DatasetItem, which is a dict with some special properties yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item)) @@ -1581,26 +1579,33 @@ def has_annotations(self) -> bool: return True if annotation else False - def get_annotations(self, item_ids=[]) -> list: + def get_annotations(self, item_id=[]) -> list: """ Retrieves the annotations for this dataset. - :param item_ids: A list of item IDs to get the annotations from. - If empty, get all the annotations for this dataset. + :param item_id: A list of item IDs to get the annotations from. + If empty, get all the annotations for this dataset. + May also be a string to get annotations from a specific item. return list: All annotations, each in their own dictionary. """ - if item_ids: - annotations = self.db.fetchall("SELECT * FROM annotations " - "WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_ids))) + annotations = [] + if item_id: + if isinstance(item_id, str): + item_id = [item_id] + ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", + (self.key, tuple(item_id))) else: - annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s;", (self.key,)) - if not annotations: - annotations = [] - else: - annotations = [Annotation(data=annotation, db=self.db) for annotation in annotations] + if not ids: + return [] + + ids = [i["id"] for i in ids] + + for id in ids: + annotations.append(Annotation.get_by_id(id, self.db)) return annotations From be8ac89c8323fffbb258119ed506d7f7fc1484f2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 17:13:28 +0200 Subject: [PATCH 125/204] First steps to make new annotation system work with Explorer --- backend/lib/processor.py | 14 ++++++++- common/lib/annotation.py | 12 ++++---- common/lib/dataset.py | 3 +- webtool/templates/explorer/explorer.html | 2 ++ .../templates/explorer/post-annotations.html | 29 +++++++++++-------- webtool/templates/explorer/post.html | 3 -- webtool/views/views_explorer.py | 13 ++++++--- 7 files changed, 50 insertions(+), 26 deletions(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 5a0aaff52..3ee7704d1 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -3,6 +3,7 @@ """ import re import traceback +import hashlib import zipfile import typing import shutil @@ -738,13 +739,16 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not source_dataset: source_dataset = self.source_dataset + # Create a field ID based on the + # Check if this dataset already has annotation fields + field_id = "" existing_labels = source_dataset.get_annotation_field_labels() # Set some values for annotation in annotations: - # Set the default author and label to this processor's name + # Set the default label to this processor's name if not annotation.get("label"): # If the processor has already generated annotation fields, # add a number to differentiate the label @@ -752,6 +756,8 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not overwrite and label in existing_labels: label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) annotation["label"] = label + + # Set the author to this processor's name if not annotation.get("author"): 
annotation["author"] = self.name @@ -762,6 +768,12 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa annotation["metadata"] = {} annotation["metadata"]["processor-parameters"] = self.parameters + if not annotation.get("field_id"): + if not field_id: + field_id = source_dataset.key + annotation["label"] + field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() + annotation["field_id"] = field_id + annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) return annotations_saved diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 7da8a01a8..b36ae566c 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -109,7 +109,7 @@ def __init__(self, data=None, id=None, db=None): new_data = { "dataset": data["dataset"], "item_id": data["item_id"], - "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), + "field_id": data["field_id"] if data.get("field_id") else self.set_field_id(data["dataset"], data["label"]), "timestamp": 0, "timestamp_created": created_timestamp, "label": data["label"], @@ -161,14 +161,14 @@ def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", - (dataset_key, item_id, label)) + (dataset_key, str(item_id), label)) if not data: return {} data["metadata"] = json.loads(data["metadata"]) return data - def get_field_id(self, dataset_key: str, label: str) -> str: + def set_field_id(self, dataset_key: str, label: str) -> str: """ Sets a `field_id` based on the dataset key and label. This combination should be unique. @@ -176,9 +176,11 @@ def get_field_id(self, dataset_key: str, label: str) -> str: :param dataset_key: The dataset key :param label: The label of the dataset. """ - field_id = hashlib.md5(dataset_key + label.encode("utf-8")).hexdigest() + + field_id = source_dataset.key + annotation["label"] + field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() self.field_id = field_id - return field_id + return self.field_id def write_to_db(self): """ diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 364fa167b..d1aeb9838 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1592,8 +1592,9 @@ def get_annotations(self, item_id=[]) -> list: annotations = [] if item_id: - if isinstance(item_id, str): + if isinstance(item_id, str) or isinstance(item_id, int): item_id = [item_id] + item_id = [str(i) for i in item_id] ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_id))) else: diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 137864b5e..eb31de870 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -26,6 +26,8 @@ + +{% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} {% set key = dataset.data.key %} diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index e6f4c731f..94b57b47a 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -1,26 +1,31 @@
diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html
index e6f4c731f..94b57b47a 100644
--- a/webtool/templates/explorer/post-annotations.html
+++ b/webtool/templates/explorer/post-annotations.html
@@ -1,26 +1,31 @@
    {% if annotation_fields %}
-        {% set old_annotations = None %}
        {% if annotations and post.id in annotations %}
-            {% set old_annotations = annotations[post.id] %}
+            {% set post_annotations = annotations[post.id] %}
        {% endif %}
        {% for field in annotation_fields %}
-            {% set type = annotation_fields[field]["type"] %}
+            {% set type = annotation_fields[field]["type"] %}
            {% set label = annotation_fields[field]["label"] %}
-            {% set old_annotation = "" %}
-            {% if old_annotations and label in old_annotations %}
-                {% set old_annotation = old_annotations[label] %}
-            {% endif %}
-            
+
+            {% set annotation = {} %}
+            {% for post_annotation in post_annotations %}
+                {% if post_annotation.field_id == field %}
+                    {% set annotation = post_annotation %}
+                {% endif %}
+            {% endfor %}
+
            
    {% if type == 'text' %} - + {% elif type == 'textarea' %} - + {% elif type == 'dropdown' %} @@ -38,7 +43,7 @@ {% for option in annotation_fields[field]["options"] %} {% set option_id = option.keys() | first %} {% set option_label = option.values() | first %} - {% set checked = "checked" if old_annotation and option_label in old_annotation else "" %} + {% set checked = "checked" if option_label in annotation.value else "" %} {% endfor %} diff --git a/webtool/templates/explorer/post.html b/webtool/templates/explorer/post.html index 52b045886..61074757b 100644 --- a/webtool/templates/explorer/post.html +++ b/webtool/templates/explorer/post.html @@ -1,8 +1,5 @@
  • - - {% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} - {% if template == "datasource" %} {% include "explorer/datasource-templates/" + datasource + ".html" %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index a3acb82d3..b0ba04696 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -89,6 +89,9 @@ def explorer_dataset(key, page=1): posts = [] count = 0 + # Load annotations with post IDs as keys and their annotations as lists. + annotations = {} + # We don't need to sort if we're showing the existing dataset order (the default). # If we're sorting, we need to iterate over the entire dataset first. if not sort or (sort == "dataset-order" and reverse == False): @@ -120,6 +123,12 @@ def explorer_dataset(key, page=1): if not posts: return error(404, error="No posts or posts could not be displayed") + # Check whether there's already annotations made. + # If so, also pass these to the template and set the post ID + # as key so we can easily retrieve them. + for post_id in post_ids: + annotations[post_id] = dataset.get_annotations(item_id=post_id) + # We can use either a generic or a pre-made data source-specific template. template = "datasource" if has_datasource_template(datasource) else "generic" if template == "generic": @@ -130,10 +139,6 @@ def explorer_dataset(key, page=1): with open(posts_css, "r", encoding="utf-8") as css: posts_css = css.read() - # Check whether there's already annotations inserted already. - # If so, also pass these to the template. - annotations = dataset.get_annotations() - # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) From f0e61c3e00a83d1e2642e687aea73427268f8133 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 13 Aug 2024 19:30:13 +0200 Subject: [PATCH 126/204] Make annotations editable and saveable in Explorer --- common/lib/annotation.py | 22 +- common/lib/dataset.py | 8 +- webtool/static/js/explorer.js | 83 +++---- .../templates/explorer/post-annotations.html | 34 ++- webtool/views/views_explorer.py | 222 +++++++++--------- 5 files changed, 207 insertions(+), 162 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index b36ae566c..aadee1205 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -86,6 +86,9 @@ def __init__(self, data=None, id=None, db=None): # If we were able to retrieve an annotation from the db, it already exists if current: + + #current["metadata"] = json.loads(current["metadata"]) + # Check if we have to overwrite old data with new data if data: for key, value in data.items(): @@ -110,7 +113,7 @@ def __init__(self, data=None, id=None, db=None): "dataset": data["dataset"], "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.set_field_id(data["dataset"], data["label"]), - "timestamp": 0, + "timestamp": created_timestamp, "timestamp_created": created_timestamp, "label": data["label"], "type": data.get("type", "text"), @@ -124,7 +127,22 @@ def __init__(self, data=None, id=None, db=None): self.data = new_data new_or_updated = True + if isinstance(self.data["metadata"], str): + try: + self.metadata = 
json.loads(self.data["metadata"]) + except (TypeError, json.JSONDecodeError): + self.metadata = {} + for k, v in self.data.items(): + # Some type checking + try: + if k == "timestamp" or k == "timestamp_created": + v = int(v) + elif k == "by_processor": + v = bool(v) + except ValueError as e: + raise AnnotationException("Annotation fields are not of the right type (%s)" % e) + self.__setattr__(k, v) # Write to db if anything changed @@ -400,9 +418,9 @@ def __setattr__(self, attr, value): if attr == "metadata": value = json.dumps(value) + self.timestamp = int(time.time()) self.db.update("annotations", where={"id": self.id}, data={attr: value}) self.data[attr] = value - if attr == "metadata": self.metadata = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index d1aeb9838..56aeb5b5e 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1591,13 +1591,17 @@ def get_annotations(self, item_id=[]) -> list: """ annotations = [] + + # Get annotation IDs first if item_id: + # Get specific annotations if IDs are given if isinstance(item_id, str) or isinstance(item_id, int): item_id = [item_id] item_id = [str(i) for i in item_id] ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_id))) else: + # Else just get all the annotation data from this dataset ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s;", (self.key,)) if not ids: @@ -1605,6 +1609,7 @@ def get_annotations(self, item_id=[]) -> list: ids = [i["id"] for i in ids] + # Then get the annotations by ID for id in ids: annotations.append(Annotation.get_by_id(id, self.db)) @@ -1661,8 +1666,7 @@ def save_annotations(self, annotations: list, overwrite=True) -> int: :param list annotations: List of dictionaries with annotation items. Must have `item_id` and `label`. E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] - :param bool overwrite: Whether to overwrite annotation if the label is already present - for the dataset. + :param bool overwrite: Whether to overwrite the annotation if it already present. :returns int: How many annotations were saved. diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 35104d28b..6648c8ced 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -317,7 +317,7 @@ const annotations = { return annotation_fields; }, - parseAnnotation: function(e) { + parseAnnotation: function(el) { /* Converts the DOM objects of an annotation field to an annotation Object. @@ -326,34 +326,35 @@ const annotations = { */ - annotation = {} + let ann = $(el) + let field_id = ann.attr("class").split(" ")[1].replace("field-", ""); + let annotation_type = ann.attr("class").split(" ")[2].replace("type-", ""); + let item_id = ann.attr("class").split(" ")[3].replace("item-id-", ""); + let author = "Jan" + let label = ann.find(".annotation-label").text(); - let label = $(this).find(".annotation-label").text(); - let annotation_type = $(this).attr("class").split(" ").pop(); let val = undefined; let edited = false - let timestamp = Date.now() / 100 if (annotation_type === "text" || annotation_type === "textarea") { - val = $(this).find(".post-annotation-input").val(); + val = ann.find(".post-annotation-input").val(); // It can be the case that the input text is deleted // In this case we *do* want to push new data, so we check // whether there's an 'edited' class present and save if so. 
- if ($(this).find(".post-annotation-input").hasClass("edited")) { + if (ann.find(".post-annotation-input").hasClass("edited")) { edited = true } } else if (annotation_type === "dropdown") { - let selected = $(this).find(".post-annotation-options").val(); - val = selected; + val = ann.find(".post-annotation-options").val(); } else if (annotation_type === "checkbox") { val = []; - $(this).find(".post-annotation-options > input").each(function(){ - if ($(this).is(":checked")) { - val.push($(this).val()); + ann.find(".post-annotation-options > input").each(function(){ + if (ann.is(":checked")) { + val.push(ann.val()); } - if ($(this).hasClass("edited")) { + if (ann.hasClass("edited")) { edited = true } }); @@ -361,27 +362,27 @@ const annotations = { val = undefined; } } - if ((val !== undefined && val !== "") || edited) { + /*if ((val !== undefined && val !== "") || edited) { vals_changed = true; val = ""; - } + console.log("EDITED") + }*/ + /*if (vals_changed){ + annotation[post_id] = post_vals; + } +*/ // Create an annotation object and add them to the array. let annotation = { - "field_id": "", - "post_id": post_id, - "dataset": "", - "timestamp": timestamp, - "timestamp_created": "", + "field_id": field_id, + "item_id": item_id, "label": label, "type": annotation_type, - "options": "", - "value": "", - "author": "", - "by_processor": "", - "metadata": "" + "value": val, + "author": author, + "by_processor": false // Explorer annotations are human-made! } - + console.log(annotation) return annotation }, @@ -494,7 +495,7 @@ const annotations = { // For dropdowns and checkboxes, we're checking whether we // have to add or change any of their options. - else if (input_type == "checkbox" || input_type == "dropdown"){ + else if (input_type === "checkbox" || input_type === "dropdown"){ let options = annotation_fields[field].options; let valid_options = []; @@ -521,10 +522,10 @@ const annotations = { let post_id = $(this).parents("li").attr("id").split("post-")[1]; post_option_id = post_id + "-" + option_id; - if (input_type == "dropdown") { + if (input_type === "dropdown") { $(this).append(""); } - else if (input_type == "checkbox") { + else if (input_type === "checkbox") { $(this).append(""); } }); @@ -577,15 +578,15 @@ const annotations = { el = "
    "; // Add a text input for text fields - if (input_type == "text") { + if (input_type === "text") { el += ""; } - else if (input_type == "textarea") { + else if (input_type === "textarea") { el += ""; } // Add a dropdown for dropdown fields - else if (input_type == "dropdown") { + else if (input_type === "dropdown") { el += " @@ -34,7 +34,7 @@ {% for option in annotation_fields[field]["options"] %} {% set option_id = option.keys() | first %} {% set option_label = option.values() | first %} - + {% endfor %} @@ -50,6 +50,24 @@
    {% endif %} + {# Tooltip with metadata on the annotation #} + {% if annotation.author or annotation.timestamp or annotation.metadata %} + + + {% endif %} + {% endif %}
  • {% endfor %} {% endif %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index b0ba04696..88a2b64d8 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -3,8 +3,6 @@ format and lets users annotate the data. """ -import json - from pathlib import Path from flask import request, render_template @@ -12,7 +10,6 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet -from common.lib.annotation import Annotation from common.lib.helpers import convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -20,13 +17,13 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 1}) -@app.route('/results//explorer/page/') +@app.route('/results//explorer/', defaults={'page': 1}) +@app.route('/results//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(key, page=1): +def explorer_dataset(dataset_key: str, page=1): """ Show posts from a dataset @@ -39,7 +36,7 @@ def explorer_dataset(key, page=1): # Get dataset info. try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") @@ -48,7 +45,6 @@ def explorer_dataset(key, page=1): datasource = parameters["datasource"] post_count = int(dataset.data["num_rows"]) annotation_fields = dataset.get_annotation_fields() - datasource_config = config.get("explorer.config", {}).get(datasource,{}) warning = "" # See if we can actually serve this page @@ -76,7 +72,7 @@ def explorer_dataset(key, page=1): # If the dataset is generated from an API-accessible database, we can add # extra features like the ability to navigate across posts. - has_database = False # INTEGRATE LATER ///////////////////// + has_database = False # todo: integrate # Check if we have to sort the data. sort = request.args.get("sort") @@ -92,7 +88,7 @@ def explorer_dataset(key, page=1): # Load annotations with post IDs as keys and their annotations as lists. annotations = {} - # We don't need to sort if we're showing the existing dataset order (the default). + # We don't need to sort if we're showing the existing dataset order (default). # If we're sorting, we need to iterate over the entire dataset first. if not sort or (sort == "dataset-order" and reverse == False): for row in dataset.iterate_items(warn_unmappable=False): @@ -125,16 +121,17 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations made. # If so, also pass these to the template and set the post ID - # as key so we can easily retrieve them. + # as key, so we can easily retrieve them. for post_id in post_ids: annotations[post_id] = dataset.get_annotations(item_id=post_id) - # We can use either a generic or a pre-made data source-specific template. + # We can use either a generic or a pre-made, data source-specific template. 
template = "datasource" if has_datasource_template(datasource) else "generic" if template == "generic": posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/generic.css") else: posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css") + # Read CSS and pass as a string with open(posts_css, "r", encoding="utf-8") as css: posts_css = css.read() @@ -142,89 +139,17 @@ def explorer_dataset(key, page=1): # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) -@app.route('/results///explorer') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_api_thread(datasource, thread_id): - """ - /// INTEGRATE LATER! - - Show a thread from an API-accessible database. - - :param str datasource: Data source ID - :param str board: Board name - :param int thread_id: Thread ID - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not thread_id: - return error(404, error="No thread ID provided") - - # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.max_posts', 500000) - - # Get the posts with this thread ID. - posts = get_local_posts(db, datasource, ids=tuple([thread_id]), threads=True, order_by=["id"]) - - if not posts: - return error(404, error="No posts available for this thread") - - posts = [strip_html(post) for post in posts] - posts = [format(post, datasource=datasource) for post in posts] - - return render_template("explorer/explorer.html", datasource=datasource, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) - -@app.route('/explorer/post///') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_api_posts(datasource, post_ids): - """ - /// INTEGRATE LATER - - Show posts from an API-accessible database. - - :param str datasource: Data source ID - :param str board: Board name - :param int post_ids: Post IDs - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not post_ids: - return error(404, error="No thread ID provided") - - # Get the posts with this thread ID. 
- posts = get_database_posts(db, datasource, board=board, ids=tuple([post_ids]), threads=True, order_by=["id"]) - - posts = [strip_html(post) for post in posts] - posts = [format(post) for post in posts] - - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts)) - @app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(key): +def explorer_save_annotation_fields(key: str) -> int: """ Save the annotation fields of a dataset to the datasets table. - :param str key: The dataset key. + :param key: The dataset key. :return-error 404: If the dataset ID does not exist. :return int: The number of annotation fields saved. @@ -244,38 +169,46 @@ def explorer_save_annotation_fields(key): return "success" -@app.route("/explorer/save_annotations/", methods=["POST"]) +@app.route("/explorer/save_annotations/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotations(key=None): +def explorer_save_annotations(dataset_key: str): """ Save the annotations of a dataset to the annotations table. - :param str key: The dataset key. + :param dataset_key: The dataset key. Must be explicitly given to ensure + annotations are tied to a dataset + + :return-error 404: If the dataset key does not exist. - :return-error 404: If the dataset key does not exist. - :return int: The number of posts with annotations saved. """ # Save it! annotations = request.get_json() - - # Annotations are always associated with a dataset. - if not key and annotations: - key = annotations[0].get("dataset", "") - if not key: - return error(404, error="No dataset key provided") try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") - - return dataset.save_annotations(annotations) -def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): + dataset.save_annotations(annotations, overwrite=True) + return "success" + +@app.route("/explorer/save_annotation/", methods=["POST"]) +@api_ratelimit +@login_required +@setting_required("privileges.can_run_processors") +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_save_annotation(key="") -> int: + """ + todo: integrate + """ + return 0 + +def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. This is basically a wrapper function for `iterate_items()` with the @@ -284,9 +217,11 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): This first iterates through the entire file (with a max limit) to determine an order. Then it yields items based on this order. - :param key, str: The dataset object. + :param dataset, str: The dataset object. :param sort_by, str: The item key that determines the sort order. :param reverse, bool: Whether to sort by largest values first. 
+ + :returns dict: Yields iterated post """ # Storing posts in the right order here @@ -314,6 +249,7 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ + todo: Integrate later Retrieve posts by ID from a database-accessible data source. """ @@ -335,19 +271,93 @@ def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, of return posts -def has_datasource_template(datasource): +def has_datasource_template(datasource: str) -> bool: """ Check if the data source has a data source-specific template. This requires HTML and CSS files. Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. Custom CSS files should be placed in `webtool/static/css/explorer/.css`. - :param datasource, str: Datasource name. - :return: bool, Whether the required files are present. + :param datasource: Datasource name. + + :returns: bool, Whether the required files are present. """ css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() if css_exists and html_exists: return True - return False \ No newline at end of file + return False + +@app.route('/results///explorer') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_thread(datasource, thread_id): + """ + todo: INTEGRATE LATER! + + Show a thread from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int thread_id: Thread ID + + :return-error 404: If the thread ID does not exist for the given data source. + """ + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not thread_id: + return error(404, error="No thread ID provided") + + # The amount of posts that may be included (limit for large datasets) + max_posts = config.get('explorer.max_posts', 500000) + + # Get the posts with this thread ID. + #todo: define function get_api_posts + posts = get_api_posts(db, datasource, ids=tuple([thread_id]), threads=True, order_by=["id"]) + + if not posts: + return error(404, error="No posts available for this thread") + + posts = [strip_html(post) for post in posts] + posts = [format(post, datasource=datasource) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) + +@app.route('/explorer/post///') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_posts(datasource, post_ids): + """ + todo: INTEGRATE LATER + + Show posts from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int post_ids: Post IDs + + :return-error 404: If the thread ID does not exist for the given data source. 
+ """ + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not post_ids: + return error(404, error="No thread ID provided") + + # Get the posts with this thread ID. + posts = get_database_posts(db, datasource, board=board, ids=tuple([post_ids]), threads=True, order_by=["id"]) + + posts = [strip_html(post) for post in posts] + posts = [format(post) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts)) \ No newline at end of file From 28032b5ad820f27edd64e7db203391e461b68eb9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:03 +0200 Subject: [PATCH 127/204] Make Tumblr search code a bit neater --- datasources/tumblr/search_tumblr.py | 239 +++++++++++++++------------- 1 file changed, 126 insertions(+), 113 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0f696507b..ddf02f023 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -15,7 +15,6 @@ import json from requests.exceptions import ConnectionError from datetime import datetime -from ural import urls_from_text from common.config_manager import config from backend.lib.search import Search @@ -23,12 +22,12 @@ from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ConfigException from common.lib.item_mapping import MappedItem - __author__ = "Sal Hagen" __credits__ = ["Sal Hagen", "Tumblr API (api.tumblr.com)"] __maintainer__ = "Sal Hagen" __email__ = "4cat@oilab.eu" + class SearchTumblr(Search): """ Tumblr data filter module. @@ -38,15 +37,17 @@ class SearchTumblr(Search): title = "Search Tumblr" # title displayed in UI description = "Retrieve Tumblr posts by tags or blogs." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_local = False # Whether this datasource is locally scraped - is_static = False # Whether this datasource is still updated + is_local = False # Whether this datasource is locally scraped + is_static = False # Whether this datasource is still updated # not available as a processor for existing datasets accepts = [None] max_workers = 1 - max_retries = 3 # For API and connection retries. - max_date_retries = 96 + 150 # For checking dates. 96 time retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). + # For API and connection retries. + max_retries = 3 + # For checking dates. 96 retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). + max_date_retries = 96 + 150 max_posts = 1000000 max_reblogs = 1000 @@ -109,23 +110,21 @@ def get_options(cls, parent_dataset=None, user=None): "query": { "type": UserInput.OPTION_TEXT_LARGE, "help": "Tags, blogs, or post URLs.", - "tooltip": " Seperate with comma or newline. Example:\n#research tools, @4catblog, https://tumblr.com/4catblog/12347714095" + "tooltip": "Seperate with comma or newline, e.g.: #research tools, @4catblog, https://tumblr.com/4catblog/123456789" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, "help": "Add note data (warning: slow)", "tooltip": "Add note data for every post. This includes note metrics, " - "replies, reblogged text, and reblogged images. " - "Blog- and id-level search includes reblogged text by default. 
" - "Enables adding reblogs as new posts " - "Limited to the first 1,000 reblogs per post.", + "replies, reblogged text, and reblogged images. " + "Blog- and id-level search includes reblogged text by default. " + "Limited to the first 1,000 reblogs per post.", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs", - "tooltip": "Add reblogs to the dataset. " - "", + "tooltip": "Add reblogs of initially captured posts as new posts to the dataset. ", "requires": "get_notes==true", "default": False }, @@ -149,7 +148,7 @@ def get_options(cls, parent_dataset=None, user=None): } try: - config_keys = SearchTumblr.get_tumblr_keys(user) + SearchTumblr.get_tumblr_keys(user) except ConfigException: # No 4CAT set keys for user; let user input their own options["key-info"] = { @@ -185,24 +184,24 @@ def get_options(cls, parent_dataset=None, user=None): } options["divider"] = { - "type": UserInput.OPTION_DIVIDER - } + "type": UserInput.OPTION_DIVIDER + } options["date-intro"] = { - "type": UserInput.OPTION_INFO, - "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " - "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " - "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" - "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " - "the date parameter (before) with six hours and sends the query again. This often " - "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " - "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " - "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " - "your dataset to see if it this is indeed the case and whether any odd time gaps exists." - } + "type": UserInput.OPTION_INFO, + "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " + "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" + "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " + "the date parameter (before) with six hours and sends the query again. This often " + "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " + "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " + "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " + "your dataset to see if it this is indeed the case and whether any odd time gaps exists." 
+ } options["daterange"] = { - "type": UserInput.OPTION_DATERANGE, - "help": "Date range" - } + "type": UserInput.OPTION_DATERANGE, + "help": "Date range" + } return options @@ -240,14 +239,16 @@ def get_items(self, query): # Connect to Tumblr API try: self.client = self.connect_to_tumblr() - except ConfigException as e: + except ConfigException: self.log.warning(f"Could not connect to Tumblr API: API keys invalid or not set") self.dataset.finish_with_error(f"Could not connect to Tumblr API: API keys invalid or not set") return except ConnectionRefusedError as e: client_info = self.client.info() self.log.warning(f"Could not connect to Tumblr API: {e}; client_info: {client_info}") - self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") + self.dataset.finish_with_error( + f"Could not connect to Tumblr API:" + f"{client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return # For each tag or blog, get posts @@ -270,20 +271,20 @@ def get_items(self, query): # Post URL elif "tumblr.com/" in query: - + try: # Format https://{blogname}.tumblr.com/post/{post_id} if "/post/" in query: blog_name = query.split(".tumblr.com")[0].replace("https://", "").replace("www.", "").strip() post_id = query.split("/")[-1].strip() - # May also be a slug string.. + # May also be a slug string. if not post_id.isdigit(): post_id = query.split("/")[-2].strip() # Format https://tumblr.com/{blogname}/{post_id} else: blog_and_id = query.split("tumblr.com/")[-1] - blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL + blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL blog_name, post_id = blog_and_id.split("/") if not post_id.isdigit(): post_id = query.split("/")[-2].strip() @@ -322,7 +323,7 @@ def get_items(self, query): # The post rail is stored in the trail list for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these - if not "broken_blog_name" in trail_post: + if "broken_blog_name" not in trail_post: if trail_post["post"]["id"] not in self.seen_ids: extra_posts.append({"blog": trail_post["blog"]["name"], "id": trail_post["post"]["id"]}) @@ -331,12 +332,13 @@ def get_items(self, query): # Blog-level searches already have some note data, like reblogged text, # but not everything (like replies), so we're going to retrieve these here as well. # Also store IDs of reblogs/reblogged posts that we want to add. - if get_notes: - # Create a dictionary with the `reblog_key` as key and notes as value. - # Notes are the same for all posts in a reblog chain. - # This means that we may not have to re-query the same data. - retrieved_notes = {} + # Create a dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain. + # This means that we may not have to re-query the same data. + retrieved_notes = {} + + if get_notes: for i, post in enumerate(results): @@ -345,7 +347,7 @@ def get_items(self, query): if self.api_limit_reached: break - self.dataset.update_status("Retrieving notes for post %i/%i" % (i+1, len(results))) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i + 1, len(results))) # We may have already encountered this note-chain # with a different post. @@ -368,31 +370,33 @@ def get_items(self, query): # Only gets first 1,000 replies or text/tag reblogs. 
# We're using different querying modes since - # it'll speed up the process. The fastest is + # it'll speed up the process. The fastest is # `conversation`, which prioritises text reblogs and # replies, and also provides metrics on like and reblog counts; # we'll use this as default. If the user # has indicated they also want to add reblogs with tags, # we'll also use the `reblogs_with_tags` mode. seen_notes = set() - notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", + max_reblogs=self.max_reblogs) reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog": # Replies don't have IDs + if note["type"] == "reblog": # Replies don't have IDs reblog_count += 1 seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: - tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_reblogs=self.max_reblogs - reblog_count) + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", + max_reblogs=self.max_reblogs - reblog_count) for tag_note in tag_notes["notes"]: if tag_note["post_id"] not in seen_notes: notes["notes"].append(tag_note) - + # Add to posts results[i] = {**results[i], **notes} retrieved_notes[post["reblog_key"]] = notes - + # Identify which notes/reblogs we can collect as new posts if get_reblogs: @@ -411,24 +415,25 @@ def get_items(self, query): continue extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) - + # Add reblogged posts and reblogs to dataset for i, extra_post in enumerate(extra_posts): - + self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: - + # Potentially skip new posts outside of the date range # not always present in the notes data. if not reblog_outside_daterange and (max_date and min_date): - new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, min_date=min_date) + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, + min_date=min_date) else: new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) if new_post: new_post = new_post[0] - + # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -442,9 +447,10 @@ def get_items(self, query): def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ Get Tumblr posts posts with a certain tag. - :param tag, str: the tag you want to look for + :param tag: the tag you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param api_key: The api key. :returns: a dict created from the JSON response """ @@ -455,7 +461,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): retries = 0 date_retries = 0 - # We're gonna change max_date, so store a copy for reference. + # We're going to change max_date, so store a copy for reference. max_date_original = max_date # We use the average time difference between posts to spot possible gaps in the data. 
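The average-interval bookkeeping mentioned above underpins a simple gap heuristic: if the time difference between two consecutive posts is much larger than the running average, the API has probably skipped a stretch of posts and the query is restarted from that point. A minimal sketch of the check, detached from the Tumblr client (the function name and threshold factor are illustrative):

    def looks_like_gap(timestamps: list, factor: int = 5) -> bool:
        # Posts arrive newest-first; a sudden large interval relative to
        # the average so far suggests the API silently skipped a stretch.
        if len(timestamps) < 3:
            return False
        diffs = [a - b for a, b in zip(timestamps, timestamps[1:])]
        avg_so_far = sum(diffs[:-1]) / len(diffs[:-1])
        return diffs[-1] > avg_so_far * factor

    print(looks_like_gap([1000, 990, 980, 900]))  # True: 80 >> average of 10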
@@ -479,7 +485,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): break try: - # PyTumblr does not allow to use the `npf` parameter yet + # PyTumblr does not allow to use the `npf` parameter yet # for the `tagged` endpoint (opened a pull request), so # we're using requests here. params = { @@ -494,19 +500,19 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): url = "https://api.tumblr.com/v2/tagged" response = requests.get(url, params=params) posts = response.json()["response"] - + except ConnectionError: - self.update_status("Encountered a connection error, waiting 10 seconds") + self.dataset.update_status("Encountered a connection error, waiting 10 seconds") time.sleep(10) retries += 1 continue - # Skip posts that we already enountered, + # Skip posts that we already encountered, # preventing Tumblr API shenanigans or double posts because of # time reductions. Make sure it's no error string, though. new_posts = [] for post in posts: - # Sometimes the API repsonds just with "meta", "response", or "errors". + # Sometimes the API responds just with "meta", "response", or "errors". if isinstance(post, str): self.dataset.update_status("Couldn't add post:", post) retries += 1 @@ -537,17 +543,18 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): date_retries += 1 - # We're first gonna check carefully if there's small time gaps by + # We're first going to check carefully if there's small time gaps by # decreasing by six hours. # If that didn't result in any new posts, also dedicate 12 date_retries # with reductions of six months, just to be sure there's no data from # years earlier missing. if date_retries < 96: - max_date -= 21600 # Decrease by six hours + max_date -= 21600 # Decrease by six hours elif date_retries <= self.max_date_retries: - max_date -= 604800 # Decrease by one week - self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) + max_date -= 604800 # Decrease by one week + self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % ( + tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. 
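The retry ladder above first steps the `before` bound back six hours at a time and then a full week at a time; the bound check just below then ends the search once `max_date` drops under `min_date`. The decrement schedule, condensed into a sketch (constants taken from the code above):

    def next_before(max_date: int, date_retries: int) -> int:
        # First 96 retries step back six hours; later retries a full week.
        step = 21600 if date_retries < 96 else 604800
        return max_date - step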
if min_date != 0:
@@ -587,7 +594,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 
 				if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5):
 
 					time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S")
-					self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,))
+					self.dataset.update_status(
+						"Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,))
 
 					posts = [post for post in posts if post["timestamp"] >= date]
 					if posts:
 						all_posts += posts
@@ -607,7 +615,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 
 			if max_date < min_date:
 				# Get rid of all the posts that are earlier than the max_date timestamp
-				posts = [post for post in posts if post["timestamp"] >= min_date and post["timestamp"] <= max_date_original]
+				posts = [post for post in posts if min_date <= post["timestamp"] <= max_date_original]
 
 				if posts:
 					all_posts += posts
@@ -636,7 +644,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 					self.max_posts_reached = True
 					break
 
-			self.dataset.update_status("Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,))
+			self.dataset.update_status(
+				"Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,))
 			time.sleep(.2)
 
 		return all_posts
 
 	def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 		"""
 		Get Tumblr posts from a certain blog
-		:param blog, str: the name of the blog you want to look for
-		:param post_id, str: the post ID (optional)
-		:param max_date: a unix timestamp, indicates posts should be max_date this date. 
-		:param min_date: a unix timestamp, indicates posts should be min_date this date. 
+		:param blog: the name of the blog you want to look for
+		:param post_id: the post ID (optional)
+		:param max_date: a unix timestamp, indicates posts should be before this date.
+		:param min_date: a unix timestamp, indicates posts should be after this date.
 
 		:returns: a dict created from the JSON response
 		"""
 
 		if post_id:
 			try:
-				test_id = int(post_id)
+				int(post_id)
 			except TypeError:
 				raise QueryParametersException("Post ID %s is invalid" % post_id)
 
 		# Some retries to make sure the Tumblr API actually returns everything
 		retries = 0
-		self.max_retries = 48 # 2 days
+		self.max_retries = 48  # 2 days
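The collection loop that follows pages backwards through a blog by repeatedly lowering the `before` timestamp until the API returns nothing new. A stripped-down sketch of that pattern, assuming a pytumblr-style client (retries, date bounds and error handling omitted):

    def fetch_blog_posts(client, blog, before=None):
        collected = []
        while True:
            # Each call returns up to 20 posts older than `before`.
            batch = client.posts(blog, before=before, limit=20, npf=True).get("posts", [])
            if not batch:
                break
            collected += batch
            # Continue from the oldest post retrieved so far.
            before = batch[-1]["timestamp"]
        return collected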
 		# Get Tumblr posts until there are none left.
 		while True:
@@ -682,7 +691,8 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 
 			try:
 				# Use the pytumblr library to make the API call
-				posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True)
+				posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True,
+										  notes_info=True, filter="raw", npf=True)
 				posts = posts["posts"]
 
 			except ConnectionRefusedError:
@@ -691,23 +701,26 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 					self.failed_posts.append(post_id)
 					self.dataset.update_status("ConnectionRefused: Unable to collect post %s/%s" % (blog, post_id))
 				else:
-					self.dataset.update_status("ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date))
+					self.dataset.update_status(
+						"ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date))
 				time.sleep(10)
 				continue
 
 			except Exception as e:
-				self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date))
+				self.dataset.update_status("Couldn't collect posts; likely reached the limit of the Tumblr API (%s). "
+										   "Last timestamp: %s" % (e, str(max_date)))
 				self.api_limit_reached = True
 				break
 
 			# Make sure the Tumblr API doesn't magically stop at an earlier date
 			if not posts or isinstance(posts, str):
 				retries += 1
-				max_date -= 3600 # Decrease by an hour
-				self.dataset.update_status("No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
+				max_date -= 3600  # Decrease by an hour
+				self.dataset.update_status(
+					"No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
 				continue
 
-			# Skip posts that we already enountered,
+			# Skip posts that we already encountered,
 			# preventing Tumblr API shenanigans or double posts because of
 			# time reductions. Make sure it's no error string, though.
 			new_posts = []
@@ -726,7 +739,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 			# Possibly only keep posts within the date range.
 			if max_date and min_date:
 				new_posts = [p for p in new_posts if min_date <= p["timestamp"] <= max_date]
-
+
 			if not new_posts:
 				break
@@ -749,16 +762,17 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 
 		return all_posts
 
-	def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000):
+	def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000) -> dict:
 		"""
 		Gets data on the notes of a specific post.
-		:param blog_id, str: The ID of the blog.
-		:param post_id, str: The ID of the post.
-		:param mode, str: The type of notes that get priority.
-		`conversation` prioritises text reblogs and replies.
-		:param mode, max_reblogs: Maximum amount of notes to return.
-
-		:returns: a list with dictionaries of notes.
+		:param blog_id: The ID of the blog.
+		:param post_id: The ID of the post.
+		:param mode: The type of notes that get priority.
+		`conversation` prioritises text reblogs and replies.
+		:param max_reblogs: Maximum amount of notes to return,
+		i.e. the number of reblogs to collect.
+
+		:returns: a dictionary with notes and note metrics.
""" post_notes = [] @@ -780,6 +794,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): stop_collecting = False # For status updates + note_type = "" if mode == "conversation": note_type = "reblogs with text" elif mode == "reblogs_with_tags": @@ -800,11 +815,12 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): # Important: we're getting notes in 'conversation' mode to # prioritise replies and reblogs that add text. - # We're not interested in the the names of authors that liked the post + # We're not interested in the names of authors that liked the post # or who reblogged without adding content. notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode=mode) except ConnectionRefusedError: - self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) + self.dataset.update_status( + "Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 time.sleep(10) continue @@ -847,7 +863,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): count += 1 post_notes.append(note) - + if count >= max_reblogs: post_notes = post_notes[:count + note_metrics.get("reply_count", 0)] stop_collecting = True @@ -857,7 +873,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - + self.dataset.update_status("Collected %s %s for @%s:%s" % (count, note_type, blog_id, post_id)) time.sleep(.2) @@ -894,9 +910,9 @@ def connect_to_tumblr(self): """ # User input keys config_keys = [self.parameters.get("consumer_key"), - self.parameters.get("consumer_secret"), - self.parameters.get("key"), - self.parameters.get("secret_key")] + self.parameters.get("consumer_secret"), + self.parameters.get("key"), + self.parameters.get("secret_key")] if not all(config_keys): # No user input keys; attempt to use 4CAT config keys config_keys = self.get_tumblr_keys(self.owner) @@ -927,12 +943,13 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ + # no query 4 u if not query.get("query", "").strip(): raise QueryParametersException("You must provide a search query.") # reformat queries to be a comma-separated list - items = query.get("query").replace("#","") + items = query.get("query").replace("#", "") items = items.split("\n") # Not more than 10 plox @@ -964,12 +981,11 @@ def map_item(post): Parse Tumblr posts. Tumblr posts can be many different types, so some data processing is necessary. - :param posts, list: List of Tumblr posts as returned form the Tumblr API. + :param post: Tumblr post, as returned by the Tumblr API. - :return dict: Mapped item + :return dict: Mapped item """ - media_types = ["photo", "video", "audio"] image_urls = [] image_urls_reblogged = [] video_urls = [] @@ -983,19 +999,14 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - authors_reblogged = [] - reblog_trail = [] body_reblogged = [] reblog_trail = [] body_ask = [] author_ask = "" authors_replied = [] - like_count = "" replies = [] unknown_blocks = [] - ordered_list_count = 1 - # Sometimes the content order is reshuffled in the `layout` property, # so we have to follow this. 
content_order = [] @@ -1018,7 +1029,7 @@ def map_item(post): # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. for i in content_order: - + block = post["content"][i] block_type = block["type"] @@ -1085,14 +1096,14 @@ def map_item(post): # This includes reblogged content, but it's not entirely complete (e.g. no tags) # so we'll only store the original blog name and its text + image content. for i, reblog in enumerate(post.get("trail", [])): - + reblogged_text = [] if "broken_blog_name" in reblog: reblog_author = reblog["broken_blog_name"] else: reblog_author = reblog["blog"]["name"] - + for reblog_block in reblog.get("content", []): if reblog_block["type"] == "text": reblogged_text.append(reblog_block["text"]) @@ -1102,7 +1113,7 @@ def map_item(post): if not reblogged_text: reblogged_text = "" body_reblogged.append("\n".join(reblogged_text)) - + reblog_trail.append(reblog_author) return MappedItem({ @@ -1112,7 +1123,7 @@ def map_item(post): "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], "timestamp": datetime.fromtimestamp(post["timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), - "unix_timestamp": post["timestamp"], + "unix_timestamp": post["timestamp"], "author_subject": post["blog"]["title"], "author_description": strip_tags(post["blog"]["description"]), "author_url": post["blog"]["url"], @@ -1158,7 +1169,7 @@ def format_tumblr_text(text_content): Format text content according to Tumblr's Neue Post Format definition. Returns text as mardkown. - :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) + :param text_content: A list of `content` as returned by the Tumblr API (can also be part of a `trail`). 
:returns dict """ @@ -1180,7 +1191,7 @@ def format_tumblr_text(text_content): s = fmt["start"] e = fmt["end"] - opening = True # To know if styles need to be appended or prepended + opening = True # To know if styles need to be appended or prepended for n in [s, e]: insert_indexes.add(n) n = str(n) @@ -1203,9 +1214,10 @@ def format_tumblr_text(text_content): n = int(n) + extra_chars text = text[:n] + insert + text[n:] extra_chars += len(insert) - + # Some more 'subtype' formatting subtype = text_content.get("subtype") + ordered_list_count = 1 if subtype: if subtype == "unordered-list-item": text = "- " + text @@ -1238,4 +1250,5 @@ def after_process(self): errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_posts)) if errors: self.dataset.log(";\n ".join(errors)) - self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details") + self.dataset.update_status( + f"Dataset completed but failed to capture some notes/reblogs; see log for details") From 12b54b1416fb4999806e864dc8a8524f74900941 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:29 +0200 Subject: [PATCH 128/204] Add hash function to helpers --- common/lib/helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 83e31fc4b..639d93df6 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -3,6 +3,7 @@ """ import subprocess import requests +import hashlib import datetime import smtplib import fnmatch @@ -897,3 +898,9 @@ def folder_size(path='.'): elif entry.is_dir(): total += folder_size(entry.path) return total + +def hash_values(string: str) -> str: + """ + Hash a string + """ + return hashlib.md5(string.encode("utf-8")).hexdigest() \ No newline at end of file From 2e6185c9f56a0dbddf2a7f1dededafea2f234a6c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:49 +0200 Subject: [PATCH 129/204] Revert test code in count posts processor --- processors/metrics/count_posts.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index af32ed565..3ae077644 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -51,17 +51,12 @@ def process(self): first_interval = "9999" last_interval = "0000" - annotations = [] - self.dataset.update_status("Processing items") with self.dataset.get_results_path().open("w") as results: counter = 0 for post in self.source_dataset.iterate_items(self): - annotation = {"value": "test", "item_id": post["id"]} - annotations.append(annotation) - try: date = get_interval_descriptor(post, timeframe) except ValueError as e: @@ -152,7 +147,6 @@ def process(self): row["value_relative"] = intervals[interval]["relative"] rows.append(row) - self.write_annotations(annotations) self.write_csv_items_and_finish(rows) @classmethod From 4ac3e62a2d848959d73fcdc580cbf374f416d3ca Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:09:17 +0200 Subject: [PATCH 130/204] Change parameter in Jinja2 template --- webtool/templates/components/result-result-row.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 06bd59290..331eecdf8 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -46,7 +46,7 @@ {% endif %} {% if 
__user_config("privileges.can_use_explorer") and has_explorer %}
  • - + Explore & annotate From 50cae616ab4ccde8d7b1810a92750fdee8a42301 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:21:23 +0200 Subject: [PATCH 131/204] Don't initialise Annotation() twice --- common/lib/annotation.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index aadee1205..c12c53eeb 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -5,8 +5,8 @@ import time import json -import hashlib +from common.lib.helpers import hash_values from common.lib.exceptions import AnnotationException class Annotation: @@ -23,7 +23,6 @@ class Annotation: data = None db = None - id = None # Unique ID for this annotation item_id = None # ID of the item for this annotation, e.g. post ID field_id = None # If of this type of annotation field for this dataset @@ -51,7 +50,7 @@ def __init__(self, data=None, id=None, db=None): required_fields = ["label", "item_id", "dataset"] # Must have an ID or data - if id is None and (data is None or not isinstance(data, dict)): + if (id is None and data is None) or (data is not None and not isinstance(data, dict)): raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.") if not db: @@ -69,7 +68,7 @@ def __init__(self, data=None, id=None, db=None): if data and "id" in data: id = data["id"] self.id = id # IDs correspond to unique serial numbers in the database. - current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) + current = self.get_by_id(id) if not current: raise AnnotationException( "Annotation() requires a valid ID for an existing annotation, %s given" % id) @@ -86,9 +85,6 @@ def __init__(self, data=None, id=None, db=None): # If we were able to retrieve an annotation from the db, it already exists if current: - - #current["metadata"] = json.loads(current["metadata"]) - # Check if we have to overwrite old data with new data if data: for key, value in data.items(): @@ -105,7 +101,6 @@ def __init__(self, data=None, id=None, db=None): # If this is a new annotation, set all the properties. else: - # Keep track of when the annotation was made created_timestamp = int(time.time()) @@ -142,7 +137,6 @@ def __init__(self, data=None, id=None, db=None): v = bool(v) except ValueError as e: raise AnnotationException("Annotation fields are not of the right type (%s)" % e) - self.__setattr__(k, v) # Write to db if anything changed @@ -150,13 +144,13 @@ def __init__(self, data=None, id=None, db=None): self.timestamp = int(time.time()) self.write_to_db() - def get_by_id(id: int, db): + def get_by_id(self, id: int): """ Get annotation by ID :param str id: ID of annotation :param db: Database connection object - :return: Annotation object, or `None` for invalid annotation ID + :return: Annotation object, or an empty dict if the ID doesn't exist. """ try: @@ -164,18 +158,24 @@ def get_by_id(id: int, db): except ValueError: raise AnnotationException("Id '%s' is not valid" % id) - return Annotation(id=id, db=db) + data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (id)) + + if not data: + return {} + + data["metadata"] = json.loads(data["metadata"]) + return data def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ - Get the annotation information via its dataset key, item ID, and label. + Get the annotation information via its dataset key, item ID, and field_id. This is always a unique combination. 
:param dataset_key: The key of the dataset this annotation was made for. :param item_id: The ID of the item this annotation was made for. :param label: The label of the annotation. - :return data: A dict with data of the retrieved annotation, or None if it doesn't exist. + :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist. """ data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", @@ -195,8 +195,8 @@ def set_field_id(self, dataset_key: str, label: str) -> str: :param label: The label of the dataset. """ - field_id = source_dataset.key + annotation["label"] - field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() + base_field_id = dataset_key + label + field_id = hash_values(base_field_id) self.field_id = field_id return self.field_id @@ -205,6 +205,7 @@ def write_to_db(self): Write an annotation to the database. """ db_data = self.data + db_data["timestamp"] = int(time.time()) m = db_data["metadata"] # To avoid circular reference error db_data["metadata"] = json.dumps(m) return self.db.upsert("annotations", data=db_data, constraints=["label", "dataset", "item_id"]) @@ -262,8 +263,8 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic if old_fields == new_fields: return 0 - fields_to_delete = set() # Delete all annotations with this field ID - fields_to_update = {} # Update values of annotations with this field ID + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID # Loop through the old annotation fields for old_field_id, old_field in old_fields.items(): @@ -418,7 +419,6 @@ def __setattr__(self, attr, value): if attr == "metadata": value = json.dumps(value) - self.timestamp = int(time.time()) self.db.update("annotations", where={"id": self.id}, data={attr: value}) self.data[attr] = value From cdbe6ed1cd07b09929264b6579bccd994c09a7e5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:21:57 +0200 Subject: [PATCH 132/204] Clean up and revert some JS --- webtool/static/js/explorer.js | 117 ++++++++++++++++------------------ 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 6648c8ced..10c69423b 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -22,15 +22,14 @@ const annotations = { init: function() { - let edit_field_box = $("#edit-annotation-fields"); let editor = $("#annotation-fields-editor"); let editor_controls = $("#annotation-fields-editor-controls"); var edits_made = false; // Add a new annotation field when clicking the plus icon $("#new-annotation-field").on("click", function(){ - let annotations_div = annotations.getAnnotationsDiv(); - $(annotations_div).insertBefore(edit_field_box);}); + annotations.addAnnotationField(); + }); // Show and hide the annotations editor let toggle_fields = $("#toggle-annotation-fields") @@ -89,7 +88,7 @@ const annotations = { // Make enter apply the option fields editor_controls.on("keypress", "input", function(e){ - if (e.which == 13) { + if (e.which === 13) { annotations.applyAnnotationFields(); } }); @@ -121,7 +120,7 @@ const annotations = { // Save unsaved annotations upon changing a page. 
$('.page > a').click(function(){ - if (!$("#save-annotations").hassClass('disabled')) { + if (!$("#save-annotations").hasClass('disabled')) { annotations.saveAnnotations(); } }) @@ -146,11 +145,11 @@ const annotations = { let options = $(el).parent().parent().next(); let option_fields = options.find(".option-field"); - if (type == "text" || type == "textarea") { + if (type === "text" || type === "textarea") { option_fields.remove(); } - else if (type == "dropdown" || type == "checkbox") { - if (option_fields.length == 0) { + else if (type === "dropdown" || type === "checkbox") { + if (option_fields.length === 0) { options.append(annotations.getInputField); } } @@ -218,9 +217,9 @@ const annotations = { Returns an object with the set annotation fields. */ - var annotation_fields = {}; - var warning = ""; - var labels_added = [] + let annotation_fields = {}; + let warning = ""; + let labels_added = [] annotations.warnEditor(""); @@ -239,12 +238,12 @@ const annotations = { // Get the ID of the field, so we // can later check if it already exists. - let field_id = parseInt(this.id.split("-")[1]); + let field_id = this.id.split("-")[1]; // Make sure the inputs have a label if (!label.length > 0) { label_field.addClass("invalid"); - warning = "Input names can't be empty"; + warning = "Field labels can't be empty"; } // Make sure the names can't be duplicates else if (labels_added.includes(label)) { @@ -254,7 +253,7 @@ const annotations = { // We can't add field labels that are also existing column names else if (original_columns.includes(label)) { - warning = "Fields labels cannot be an existing column name"; + warning = "Field label " + label + " is already present as a dataset item, please rename."; label_field.addClass("invalid"); } @@ -264,7 +263,7 @@ const annotations = { // Keep track of the labels we've added labels_added.push(label) - if (type == "text" || type == "textarea") { + if (type === "text" || type === "textarea") { annotation_fields[field_id] = {"type": type, "label": label}; } // Add options for dropdowns and checkboxes @@ -390,8 +389,8 @@ const annotations = { // Applies the annotation fields to each post on this page. // First we collect the annotation information from the editor - var annotation_fields = annotations.parseAnnotationFields(e); - var fields_to_add = {}; + let annotation_fields = annotations.parseAnnotationFields(e); + let fields_to_add = {}; // Show an error message if the annotation fields were not valid. if (typeof annotation_fields == "string") { @@ -426,7 +425,7 @@ const annotations = { } }); - // Add input fields to every posts in the explorer. + // Add input fields to every post in the explorer. // We take the annotations of the first post to check // what's the current state and add them to every post after. let text_fields = ["textarea", "text"]; @@ -447,7 +446,7 @@ const annotations = { // Edit the labels if they have changed. label_span = $(class_id + " > .annotation-label"); label = label_span.first().text(); - if (label != input_label) { + if (label !== input_label) { label_span.each(function(){ $(this).text(input_label); }); @@ -460,7 +459,7 @@ const annotations = { // If the change is between a textbox and textarea, // change the input type and carry over the text. 
- if (input_type != old_input_type) { + if (input_type !== old_input_type) { if (text_fields.includes(input_type) && text_fields.includes(old_input_type)) { @@ -473,11 +472,11 @@ const annotations = { } // Replace the HTML element, insert old values, and change the type class - if (input_type == "text" && old_input_type == "textarea") { + if (input_type === "text" && old_input_type === "textarea") { $(this).parent().removeClass("textarea").addClass("text"); $(this).replaceWith($("").val(add_val)); } - else if (input_type == "textarea" && old_input_type == "text") { + else if (input_type === "textarea" && old_input_type === "text") { $(this).parent().removeClass("text").addClass("textarea"); $(this).replaceWith($("")); } @@ -662,26 +661,25 @@ const annotations = { // Save the annotation fields used for this dataset // to the datasets table. - if (annotation_fields.length < 1 || annotation_fields == undefined) { - annotation_fields = annotation_fields.parseAnnotationFields; + if (annotation_fields.length < 1) { + return; } // If there's annotation fields, we can enable/disable the buttons annotations.fieldsExist(); - var dataset_key = $("#dataset-key").text(); - var json_annotations = JSON.stringify(annotation_fields); + let dataset_key = $("#dataset-key").text(); // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: json_annotations, + data: JSON.stringify(annotation_fields), success: function (response) { // If the query is accepted by the server. - if (response == 'success') { + if (response === 'success') { $("#annotations-editor-container").hide(); $("#apply-annotation-fields").addClass("disabled"); } @@ -821,43 +819,39 @@ const annotations = { pa.animate({"height": 0}, 250); }, - getAnnotationsDiv: function(id){ - // Returns an input field element with a pseudo-random ID, if none is provided. - if (id == undefined || id == 0) { - id = annotations.randomInt(); - } - - // Returns an annotation div element with a pseudo-random ID - return `
    -
    - - -
    -
    -
    -
    - -
    -
    -
    `.replace("{{FIELD_ID}}", id); + addAnnotationField: function(){ + /* + Adds an annotation field input element; + these have no IDs yet, we'll add a hashed database-label string when saving. + */ + + let annotation_field = `
    +
    + + +
    +
    +
    +
    + +
    +
    `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); + $(annotation_field).insertBefore($("#edit-annotation-fields")); }, getInputField: function(id){ - // Returns an input field element with a pseudo-random ID, if none is provided. - if (id == undefined || id == 0) { - id = annotations.randomInt(); + // Returns an option field element with a pseudo-random ID, if none is provided. + if (id === undefined || id === 0) { + id = Math.floor(Math.random() * 100000000).toString(); } return "
    "; }, - - randomInt: function(){ - return Math.floor(Math.random() * 100000000); - } }; const page_functions = { @@ -872,13 +866,14 @@ const page_functions = { // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ - + + // Get the column to sort on, an whether we should sort in reverse. let selected = $("#column-sort-select").find("option:selected").val(); let order = $("#column-sort-order").find("option:selected").val(); sort_order = "" - if (order == "reverse"){ + if (order === "reverse"){ sort_order = "&order=reverse" } From 851e067f0f1b25a464a00672b7fee09908d2d083 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:22:13 +0200 Subject: [PATCH 133/204] Separate annoatation field into a component --- .../templates/explorer/annotation-field.html | 40 ++++++++++++++++ .../explorer/annotations-editor.html | 47 ++----------------- 2 files changed, 45 insertions(+), 42 deletions(-) create mode 100644 webtool/templates/explorer/annotation-field.html diff --git a/webtool/templates/explorer/annotation-field.html b/webtool/templates/explorer/annotation-field.html new file mode 100644 index 000000000..1f944d366 --- /dev/null +++ b/webtool/templates/explorer/annotation-field.html @@ -0,0 +1,40 @@ +{% set annotation_type = annotation_field["type"] %} + +{% set label = annotation_field["label"] %} + +
    +
    + + +
    +
    +
    +
    + +
    +
    + +{% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    +
    + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    + {% endfor %} +
    + +
    +
    +
    +{% else %} +
    +{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index cf356f542..bb75e6bff 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -27,49 +27,12 @@
  • {% if annotation_fields %} + {% for field in annotation_fields %} + {% set annotation_field = annotation_fields[field] %} + {% include "explorer/annotation-field.html" %} + {% endfor %} + {% endif %} - {% for field in annotation_fields %} - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} - -
    -
    - - -
    -
    -
    -
    - -
    -
    - - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    -
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    - {% endfor %} -
    - -
    -
    -
    - {% else %} -
    - {% endif %} - {% endfor %} - {% endif %}
    New field From f0a97081d5aa70a2a70019b391a1b2bebe145ba7 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:50:45 +0200 Subject: [PATCH 134/204] Make processor and Explorer annotation features co-exist peacefully --- backend/lib/processor.py | 19 +++--- common/lib/dataset.py | 86 +++++++++++++++------------ webtool/views/views_explorer.py | 101 +++++++++++++++++++++----------- 3 files changed, 123 insertions(+), 83 deletions(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 3ee7704d1..47dbcf3c9 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -3,7 +3,6 @@ """ import re import traceback -import hashlib import zipfile import typing import shutil @@ -11,13 +10,14 @@ import abc import csv import os +import random from pathlib import Path, PurePath from backend.lib.worker import BasicWorker from common.lib.dataset import DataSet from common.lib.fourcat_module import FourcatModule -from common.lib.helpers import get_software_commit, remove_nuls, send_email +from common.lib.helpers import get_software_commit, remove_nuls, send_email, hash_values from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, DataSetException, MapItemException) from common.config_manager import config, ConfigWrapper @@ -739,10 +739,9 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not source_dataset: source_dataset = self.source_dataset - # Create a field ID based on the + already_exists_error = False # Check if this dataset already has annotation fields - field_id = "" existing_labels = source_dataset.get_annotation_field_labels() # Set some values @@ -756,6 +755,10 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not overwrite and label in existing_labels: label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) annotation["label"] = label + elif annotation.get("label") and not overwrite: + if annotation["label"] in existing_labels: + already_exists_error = annotation["label"] + break # Set the author to this processor's name if not annotation.get("author"): @@ -768,11 +771,9 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa annotation["metadata"] = {} annotation["metadata"]["processor-parameters"] = self.parameters - if not annotation.get("field_id"): - if not field_id: - field_id = source_dataset.key + annotation["label"] - field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() - annotation["field_id"] = field_id + if already_exists_error: + self.dataset.finish_with_error( + "Annotation label '%s' already exists for this dataset" % already_exists_error) annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) return annotations_saved diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 56aeb5b5e..56f92601f 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1,7 +1,6 @@ import collections import itertools import datetime -import hashlib import fnmatch import random import shutil @@ -14,7 +13,7 @@ from common.config_manager import config from common.lib.annotation import Annotation from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, hash_values from common.lib.item_mapping import MappedItem, DatasetItem from 
common.lib.fourcat_module import FourcatModule
 from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException,
@@ -965,7 +964,7 @@ def get_key(self, query, parameters, parent="", time_offset=0):
 		parent_key = str(parent) if parent else ""
 		plain_key = repr(param_key) + str(query) + parent_key
-		hashed_key = hashlib.md5(plain_key.encode("utf-8")).hexdigest()
+		hashed_key = hash_values(plain_key)
 
 		if self.db.fetchone("SELECT key FROM datasets WHERE key = %s", (hashed_key,)):
 			# key exists, generate a new one
@@ -1584,17 +1583,17 @@ def get_annotations(self, item_id=[]) -> list:
 		Retrieves the annotations for this dataset.
 
 		:param item_id: A list of item IDs to get the annotations from.
-		If empty, get all the annotations for this dataset.
-		May also be a string to get annotations from a specific item.
+		May also be a string or int to get a specific annotation.
+		If left empty, get all the annotations for this dataset.
 
-		return list: All annotations, each in their own dictionary.
+		return list: List of Annotation objects.
 		"""
 
 		annotations = []
 
 		# Get annotation IDs first
 		if item_id:
-			# Get specific annotations if IDs are given
+			# Cast to string
 			if isinstance(item_id, str) or isinstance(item_id, int):
 				item_id = [item_id]
 			item_id = [str(i) for i in item_id]
@@ -1607,11 +1606,10 @@
 		if not ids:
 			return []
 
-		ids = [i["id"] for i in ids]
-
 		# Then get the annotations by ID
+		ids = [i["id"] for i in ids]
 		for id in ids:
-			annotations.append(Annotation.get_by_id(id, self.db))
+			annotations.append(Annotation(id=id, db=self.db))
 
 		return annotations
@@ -1678,53 +1676,57 @@ def save_annotations(self, annotations: list, overwrite=True) -> int:
 		count = 0
 		annotation_fields = self.get_annotation_fields()
 		annotation_labels = self.get_annotation_field_labels()
-		known_field_ids = {}  # Just so we don't have to hash every annotation without a field ID
+
+		field_id = ""
+		salt = str(random.randrange(0, 1000000))
 
 		# Add some dataset data to annotations, if not present
-		for annotation in annotations:
+		for annotation_data in annotations:
 
 			# Check if the required fields are present
-			if "item_id" not in annotation:
+			if "item_id" not in annotation_data:
 				raise AnnotationException("Can't save annotations; annotation must have an `item_id` referencing "
-										  "the item they annotated, got %s" % annotation)
+										  "the item it annotated, got %s" % annotation_data)
-			if "label" not in annotation or not isinstance(annotation["label"], str):
+			if "label" not in annotation_data or not isinstance(annotation_data["label"], str):
 				raise AnnotationException("Can't save annotations; annotation must have a `label` field, "
										  "got %s" % annotation_data)
-			if not overwrite and annotation["label"] in annotation_labels:
+			if not overwrite and annotation_data["label"] in annotation_labels:
 				raise AnnotationException("Can't save annotations; annotation field with label %s "
-										  "already exists" % annotation["label"])
+										  "already exists" % annotation_data["label"])
 
 			# Set dataset key
-			if not 
annotation_data.get("dataset"): + annotation_data["dataset"] = self.key # Set default author to this dataset owner # If this annotation is made by a processor, it will have the processor name - if not annotation.get("author"): - annotation["author"] = self.get_owners()[0] + if not annotation_data.get("author"): + annotation_data["author"] = self.get_owners()[0] + + # The field ID can already exists for the same dataset/key combo, + # if a previous label has been renamed. + # If we're not overwriting, create a new key with some salt. + if not overwrite: + if not field_id: + field_id = hash_values(annotation_data["dataset"] + annotation_data["label"] + salt) + if field_id in annotation_fields: + annotation_data["field_id"] = field_id + + # Create Annotation object, which also saves it to the database + # If this dataset/item ID/label combination already exists, this retrieves the + # existing data and updates it with new values. + annotation = Annotation(data=annotation_data, db=self.db) # Add data on the type of annotation field, if it is not saved to the datasets table yet. # For now this is just a simple dict with a field ID, type, label, and possible options. - if not annotation_fields or annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation.get("type", "text") # Default to text + if not annotation_fields or annotation.field_id not in annotation_fields: + annotation_fields[annotation.field_id] = { + "label": annotation.label, + "type": annotation.type # Defaults to `text` } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + if annotation.options: + annotation_fields[annotation.options] = annotation.options - # Create Annotation object, which also saves it to the database - Annotation(data=annotation, db=self.db) count += 1 # Save annotation fields if things changed @@ -1799,6 +1801,12 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int: if old_fields[field_id] != annotation_field: changes = True + # Check if fields are removed + if not add: + for field_id in old_fields.keys(): + if field_id not in new_fields: + changes = True + # If we're just adding fields, add them to the old fields. # If the field already exists, overwrite the old field. if add and old_fields: diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 88a2b64d8..abc9c5075 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -3,14 +3,16 @@ format and lets users annotate the data. """ +import json + from pathlib import Path -from flask import request, render_template +from flask import request, render_template, jsonify from flask_login import login_required, current_user from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet -from common.lib.helpers import convert_to_float +from common.lib.helpers import convert_to_float, hash_values from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -90,7 +92,7 @@ def explorer_dataset(dataset_key: str, page=1): # We don't need to sort if we're showing the existing dataset order (default). # If we're sorting, we need to iterate over the entire dataset first. 
- if not sort or (sort == "dataset-order" and reverse == False): + if not sort or (sort == "dataset-order" and not reverse): for row in dataset.iterate_items(warn_unmappable=False): count += 1 @@ -139,32 +141,43 @@ def explorer_dataset(dataset_key: str, page=1): # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) -@app.route("/explorer/save_annotation_fields/", methods=["POST"]) +@app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(key: str) -> int: +def explorer_save_annotation_fields(dataset_key: str) -> str: """ Save the annotation fields of a dataset to the datasets table. - :param key: The dataset key. + :param dataset_key: The dataset key. :return-error 404: If the dataset ID does not exist. :return int: The number of annotation fields saved. """ # Get dataset. - if not key: + if not dataset_key: return error(404, error="No dataset key provided") try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") # Save it! annotation_fields = request.get_json() + + # Field IDs are not immediately set in the front end. + # We're going to do this based on the hash of the + # dataset key and the input label (should be unique) + field_keys = list(annotation_fields.keys()) + for field_id in field_keys: + if "undefined" in field_id: + new_field_id = hash_values(dataset_key + annotation_fields[field_id]["label"]) + annotation_fields[new_field_id] = annotation_fields[field_id] + del annotation_fields[field_id] + dataset.save_annotation_fields(annotation_fields) return "success" @@ -196,19 +209,32 @@ def explorer_save_annotations(dataset_key: str): dataset.save_annotations(annotations, overwrite=True) return "success" -@app.route("/explorer/save_annotation/", methods=["POST"]) + +@app.route("/explorer/get_annotation_field", methods=["GET"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation(key="") -> int: +def get_annotation_field(): """ - todo: integrate + Returns an annotation field input div + + :return-error 406: If the list of subqueries could not be parsed. """ - return 0 + try: + annotation_field = json.loads(request.args.get("annotation_field")) + except (TypeError, json.decoder.JSONDecodeError): + return error(406, error="Unexpected format for annotation field.") + + html = render_template("explorer/annotation-field.html", annotation_field=annotation_field) + return jsonify({ + "status": "success", + "html": html} + ) + -def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) -> dict: +def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. 
This is basically a wrapper function for `iterate_items()` with the @@ -217,9 +243,9 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) This first iterates through the entire file (with a max limit) to determine an order. Then it yields items based on this order. - :param dataset, str: The dataset object. - :param sort_by, str: The item key that determines the sort order. - :param reverse, bool: Whether to sort by largest values first. + :param dataset: The dataset object. + :param sort: The item key that determines the sort order. + :param reverse: Whether to sort by largest values first. :returns dict: Yields iterated post """ @@ -228,7 +254,7 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) sorted_posts = [] # Use reversed() if we're reading the dataset from back to front. - if sort == "dataset-order" and reverse == True: + if sort == "dataset-order" and reverse: for item in reversed(list(dataset.iterate_items(**kwargs))): sorted_posts.append(item) @@ -247,12 +273,33 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) for post in sorted_posts: yield post + +def has_datasource_template(datasource: str) -> bool: + """ + Check if the data source has a data source-specific template. + This requires HTML and CSS files. + Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. + Custom CSS files should be placed in `webtool/static/css/explorer/.css`. + + :param datasource: Datasource name. + + :returns: bool, Whether the required files are present. + """ + css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() + html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() + + if css_exists and html_exists: + return True + return False + def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ todo: Integrate later Retrieve posts by ID from a database-accessible data source. """ + raise NotImplementedError + if not ids: return None @@ -271,24 +318,6 @@ def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, of return posts -def has_datasource_template(datasource: str) -> bool: - """ - Check if the data source has a data source-specific template. - This requires HTML and CSS files. - Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. - Custom CSS files should be placed in `webtool/static/css/explorer/.css`. - - :param datasource: Datasource name. - - :returns: bool, Whether the required files are present. - """ - css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() - html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() - - if css_exists and html_exists: - return True - return False - @app.route('/results///explorer') @api_ratelimit @login_required @@ -306,6 +335,7 @@ def explorer_api_thread(datasource, thread_id): :return-error 404: If the thread ID does not exist for the given data source. """ + raise NotImplementedError if not datasource: return error(404, error="No datasource provided") @@ -346,6 +376,7 @@ def explorer_api_posts(datasource, post_ids): :return-error 404: If the thread ID does not exist for the given data source. 
""" + raise NotImplementedError if not datasource: return error(404, error="No datasource provided") From e78099f859bda42206289e0a2210210d6d3645ad Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 20 Aug 2024 11:31:57 +0200 Subject: [PATCH 135/204] Test annotation processor --- .../metrics/annotation_processor_test.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 processors/metrics/annotation_processor_test.py diff --git a/processors/metrics/annotation_processor_test.py b/processors/metrics/annotation_processor_test.py new file mode 100644 index 000000000..86f6f3264 --- /dev/null +++ b/processors/metrics/annotation_processor_test.py @@ -0,0 +1,45 @@ +""" +Collapse post bodies into one long string +""" + +from common.lib.helpers import UserInput +from backend.lib.processor import BasicProcessor + + +class AnnotatePosts(BasicProcessor): + """ + Merge post body into one long string + """ + type = "annotate-posts" # job type ID + category = "Metrics" # category + title = "Annotation test" # title displayed in UI + description = "Ya know" # description displayed in UI + extension = "csv" # extension of result file, used internally and in UI + + options = { + "overwrite": { + "type": UserInput.OPTION_TOGGLE, + "default": False, + "help": "Overwrite existing annotations by this processor?" + }, + "field_label": { + "type": UserInput.OPTION_TEXT, + "default": "" + } + } + + def process(self): + import random + annotations = [] + with self.dataset.get_results_path().open("w") as results: + + for post in self.source_dataset.iterate_items(self): + + annotation = {"item_id": post["id"], + "label": self.parameters.get("field_label", ""), + "value": random.randrange(1, 1000000)} + + annotations.append(annotation) + + self.write_annotations(annotations, overwrite=self.parameters.get("overwrite", False)) + self.dataset.finish(1) \ No newline at end of file From 90a0eb0ddf755c433802b27ab24664d78c298244 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 20 Aug 2024 12:55:10 +0200 Subject: [PATCH 136/204] Improve Tumblr search description --- datasources/tumblr/DESCRIPTION.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datasources/tumblr/DESCRIPTION.md b/datasources/tumblr/DESCRIPTION.md index 8269204a1..5100cb47f 100644 --- a/datasources/tumblr/DESCRIPTION.md +++ b/datasources/tumblr/DESCRIPTION.md @@ -1,5 +1,5 @@ The Tumblr data is retrieved by interfacing with the [Tumblr API](https://api.tumblr.com). -It is only possible to get posts by tag or per blog, since the API does not allow keyword search. +It is only possible to get posts by tag, per blog, or by individual posts, since the API does not allow keyword search. ### Privacy Be aware that the data may contain personal information. It is thus recommended to pseudonymise the data. @@ -14,9 +14,8 @@ may request a rate limit increase via Tumblr. If no internal API key is set, you can insert your own. ### Date bugs -The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used -tags, it may return zero posts, even though older posts *do* exist. Check the oldest post in -your dataset to see if it this is indeed the case and whether any odd time gaps exists. +The [Tumblr API](https://api.tumblr.com) is volatile: when fetching content, it may return zero posts, even though older posts *do* exist. Check the oldest post in +your dataset to see if this is indeed the case and whether any odd time gaps exist. 
4CAT tries to mitigate this by decreasing the date parameter (before) by six hours
and sending the query again. This often successfully returns older, un-fetched posts.
If it didn't find new data after checking 24 days in the past, it checks for data up to six years

From 288dc1af0d8a7225bd610fc434b2a46c1273c8d8 Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Tue, 20 Aug 2024 12:55:42 +0200
Subject: [PATCH 137/204] Convert timestamps to the client's local time zone in Explorer

---
 webtool/static/js/explorer.js                    | 6 ++++++
 webtool/templates/explorer/post-annotations.html | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js
index 10c69423b..992465afe 100644
--- a/webtool/static/js/explorer.js
+++ b/webtool/static/js/explorer.js
@@ -864,6 +864,12 @@ const page_functions = {
 			document.querySelectorAll('.thread li').forEach(link => link.classList.remove('highlight'));
 		}));
 
+		// Change timestamps to the client's timezone
+		document.querySelectorAll(".timestamp-to-convert").forEach(function(el){
+			let local_date = new Date(parseInt(el.innerText) * 1000)
+			el.innerText = new Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date)
+		});
+
 		// Reorder the dataset when the sort type is changed
 		$(".sort-select").on("change", function(){
 
diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html
index f439a3dd3..6112397c7 100644
--- a/webtool/templates/explorer/post-annotations.html
+++ b/webtool/templates/explorer/post-annotations.html
@@ -55,7 +55,7 @@
    "; - - // Add a text input for text fields - if (input_type === "text") { - el += ""; - } - else if (input_type === "textarea") { - el += ""; - } - - // Add a dropdown for dropdown fields - else if (input_type === "dropdown") { - - el += ""; - } - - // Add checkboxes for checkbox fields - else if (input_type === "checkbox") { - - el += "
    "; - let options = fields_to_add[add_field].options; - - for (let i in options) { - - for (let option_id in options[i]) { - - option_label = options[i][option_id]; - - el += ""; - } - } - el += "
    "; - } - el += "
    "; - $(".posts li").each(function(){ - let post_id = this.id.split("post-")[1]; - $(this).find(".post-annotations").append(el.replaceAll("{POST_ID}", post_id)); - }); - } - } - - // Remove annotation forms that are deleted - var valid_fields = []; - for (var f in annotation_fields) { - valid_fields.push("field-" + f); - } - var present_annotations = $(".post-annotations").first().find(".post-annotation") - present_annotations.each(function(){ - let present_id = $(this).attr("class").split(" ")[1]; - if (!valid_fields.includes(present_id)) { - $("." + present_id).remove(); - } - }); - - // Hide annotations if there's no fields leftover - var leftover_annotations = $(".post-annotations").first().find(".post-annotation"); - if (leftover_annotations.length < 1) { - annotations.hideAnnotations(); - $("#toggle-annotations").addClass("disabled"); + annotations.saveAnnotationFields(annotation_fields); + location.reload(); } - // Else we're showing 'em - else { - annotations.showAnnotations(); - $("#toggle-annotations").removeClass("disabled"); - } - - $("#apply-annotation-fields").html(" Apply") }, saveAnnotationFields: function (annotation_fields){ @@ -695,7 +465,7 @@ const annotations = { }); }, - saveAnnotations: function (e){ + saveAnnotations: function (){ // Write the annotations to the dataset and annotations table. // First we're going to collect the data for this page. @@ -705,7 +475,6 @@ const annotations = { $(".posts > li").each(function(){ - let vals_changed = false; let post_annotations = $(this).find(".post-annotations"); if (post_annotations.length > 0) { @@ -713,7 +482,7 @@ const annotations = { post_annotations.find(".post-annotation").each(function(){ // Extract annotation object from the element - let annotation = annotations.parseAnnotation(this); + let annotation = annotations.parseAnnotation($(this)); if (annotation) { anns.push(annotation); @@ -721,11 +490,13 @@ const annotations = { }); } }) - - $("#save-annotations").html(" Saving annotations") + + let save_annotations = $("#save-annotations"); + save_annotations.html(" Saving annotations") annotations.disableSaving(); let code = "" + $.ajax({ url: getRelativeURL("explorer/save_annotations/" + dataset_key), type: "POST", @@ -738,24 +509,24 @@ const annotations = { code = response annotations.enableSaving(); - $("#save-annotations").html(" Annotations saved"); - $("#save-annotations").addClass("disabled"); - old_annotation_fields = $("#annotation-field").each(); + save_annotations.html(" Annotations saved"); + save_annotations.addClass("disabled"); + //var old_annotation_fields = $("#annotation-field").each(); // alert(alert_message); } else { annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); - $("#save-annotations").removeClass("disabled"); + save_annotations.html(" Save annotations"); + alert("Couldn't save annotations"); + save_annotations.removeClass("disabled"); console.log(response); } }, error: function (error) { annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - $("#save-annotations").removeClass("disabled"); - //alert("Could't save annotations"); + save_annotations.html(" Save annotations"); + save_annotations.removeClass("disabled"); + //alert("Couldn't save annotations"); console.log(error) } }); @@ -804,8 +575,8 @@ const annotations = { ta.html(" Hide annotations"); // Bit convoluted, but necessary to have auto height let pa = $(".post-annotations"); - current_height = pa.height(); - auto_height = 
pa.css("height", "auto").height(); + let current_height = pa.height(); + let auto_height = pa.css("height", "auto").height(); pa.height(current_height).animate({"height": auto_height}, 250, function(){ pa.height("auto"); }); @@ -852,28 +623,38 @@ const annotations = { } return "
    "; }, + + markChanges: function(el) { + // Adds current changes to a post annotation so we can save these later. + // Currently includes the time of edits and the username of the annotator + let current_username = $("#current-username").html(); + let current_date = Date.now() / 1000; + let input_field = el.find(".post-annotation-input"); + input_field.addClass("edited"); + $(el).find(".annotation-author").html(current_username); + $(el).find(".epoch-timestamp-edited").html(current_date); + $(el).find(".timestamp-edited").html(getLocalTimeStr(current_date)); + } }; const page_functions = { init: function() { - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function() { let post = 'post-' + this.getAttribute('href').split('-').pop(); document.querySelector('#' + post).classList.add('highlight'); })); - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function() { document.querySelectorAll('.thread li').forEach(link => link.classList.remove('highlight')); })); // Change timestamps to the client's timezone document.querySelectorAll(".timestamp-to-convert").forEach(function(el){ - let local_date = new Date(parseInt(el.innerText) * 1000) - el.innerText = new Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date) + el.innerText = getLocalTimeStr(el.innerText); }); // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ - // Get the column to sort on, an whether we should sort in reverse. let selected = $("#column-sort-select").find("option:selected").val(); let order = $("#column-sort-order").find("option:selected").val(); @@ -914,5 +695,10 @@ function getRelativeURL(endpoint) { return root + endpoint; } +function getLocalTimeStr(epoch_timestamp) { + let local_date = new Date(parseInt(epoch_timestamp) * 1000) + local_date = Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date); + return local_date +} }); \ No newline at end of file From 908544be2bdb83e6bfc3a2db5d8bed1e870b4ce0 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 20 Aug 2024 17:59:20 +0200 Subject: [PATCH 144/204] Style changes in Explorer --- .../templates/components/result-details.html | 4 ++-- webtool/templates/explorer/explorer.html | 3 +++ .../templates/explorer/post-annotations.html | 21 ++++++++++++++++--- webtool/views/views_explorer.py | 7 ++++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/webtool/templates/components/result-details.html b/webtool/templates/components/result-details.html index 6c3a6a8f2..485b0279d 100644 --- a/webtool/templates/components/result-details.html +++ b/webtool/templates/components/result-details.html @@ -117,10 +117,10 @@

    {% set annotations = dataset.get_annotations() %} {% if annotations %} - {{ annotations|length|numberify }} item{% if annotations|length > 1 %}s{% endif %} annotated with fields + {{ annotations|length|numberify }} annotation{% if annotations|length > 1 %}s{% endif %} {% endif %} {% for annotation_field in annotation_fields.items() %} - {{ annotation_field[1].type }} {{ annotation_field[1].label }} + {{ annotation_field[1].label }} {% endfor %}

    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index eb31de870..7301190f4 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -10,6 +10,7 @@ + + {% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index 6112397c7..34dc8d429 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -51,23 +51,38 @@ {% endif %} {# Tooltip with metadata on the annotation #} - {% if annotation.author or annotation.timestamp or annotation.metadata %} + {% if annotation.author or annotation.author_original or annotation.timestamp or annotation.metadata %} {% endif %} {% endif %} + + {# Store some invisible data here to we can retrieve in with JS #} + +
    {% endfor %} {% endif %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index abc9c5075..ab48ffd73 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -19,13 +19,14 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 1}) -@app.route('/results//explorer/page/') + +@app.route("/results//explorer/", defaults={"page": 1, "show_annotations": False}) +@app.route("/results//explorer/page/") @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(dataset_key: str, page=1): +def explorer_dataset(dataset_key: str, page=1, show_annotations=False): """ Show posts from a dataset From 5e77fe27e2accf6ec0fab4fc0356ae501b189398 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 21 Aug 2024 17:58:04 +0200 Subject: [PATCH 145/204] Redesign annotation field input controls, make them sortable, plus some other small fixes --- webtool/static/css/dataset-page.css | 44 +++++-- webtool/static/js/explorer.js | 121 +++++++++--------- webtool/static/js/fourcat.js | 1 + .../templates/explorer/annotation-field.html | 40 ------ .../templates/explorer/annotation-fields.html | 0 .../explorer/annotations-editor.html | 80 ++++++------ webtool/views/views_explorer.py | 25 ---- 7 files changed, 140 insertions(+), 171 deletions(-) delete mode 100644 webtool/templates/explorer/annotation-field.html create mode 100644 webtool/templates/explorer/annotation-fields.html diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index bbd96c7bf..308fdfa78 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -697,25 +697,41 @@ body.image-preview { } /* Explorer view */ -#annotation-fields-editor { - height: 0; - overflow: hidden; +#annotation-fields-editor-controls { + } -#annotation-fields-editor-controls { - display: grid; - grid-template-columns: auto auto auto; +#annotation-fields-editor-controls li { + position: relative; + background: white; +} + +#annotation-fields-editor-controls li:not(:last-child) { + border-bottom: 1px dotted var(--contrast-dark); +} + +.annotation-field > span, .annotation-field > i, .annotation-field > select { + vertical-align: top; } -#annotation-fields-editor-controls>div { - border-bottom: 1px solid var(--contrast-bright); +.annotation-field > i { + padding-top: 10px; +} + +.option-fields { + display: inline-block; + max-width: 250px; +} + +.option-field { + display: inline-block; } #edit-annotation-fields #input-warning { color: var(--accent-error); } -/* Remove all styles for explorer posts */ +/* Remove all styles for Explorer posts */ /* these ought to be defined specifically, */ /* and 4CAT styles shouldn't interfere. 
*/ #explorer-posts, #explorer-posts > ol li { @@ -743,6 +759,14 @@ body.image-preview { min-width: 140px; } -.annotation-field-label.invalid { +.annotation-field-label.invalid, .option-field > input.invalid { border: 1px solid red; +} + +#edit-annotation-fields { + padding: 0.5em +} + +.delete-input { + float: right; } \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 066266682..7df7ca9a2 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -74,11 +74,9 @@ const annotations = { // Delete an entire annotation input // We're in a grid of threes, so this involves three divs editor_controls.on("click", ".annotation-field > .delete-input", function(){ - let parent_div = $(this).parent().parent(); - parent_div.next().remove(); // Input type - parent_div.next().remove(); // Options - parent_div.remove(); // Label - }); + let parent_div = $(this).parent(); + parent_div.remove(); + }); // Make saving available when annotation fields are changed editor_controls.on("click", ".delete-option-field", function() { @@ -144,25 +142,20 @@ const annotations = { toggleField: function (el) { // Change the type of input fields when switching in the dropdown - let type = $(el).val(); - - let options = $(el).parent().parent().next(); - let option_fields = options.find(".option-field"); - + let options = $(el).parent().parent().find(".option-fields"); if (type === "text" || type === "textarea") { - option_fields.remove(); + options.remove(); } else if (type === "dropdown" || type === "checkbox") { - if (option_fields.length === 0) { + if (options.children().length === 0) { options.append(annotations.getInputField); } } }, addOptions: function (el){ - // Dynamically a new options for dropdowns and checkboxes - + // Dynamically a new options for dropdowns and checkboxes in the fields editor. // If text is added to a field, and there are // no empty fields available, add a new one. let no_empty_fields = true; @@ -232,25 +225,25 @@ const annotations = { // Parse information from the annotations editor. $(".annotation-field").each(function(){ - // To align the input form, we're in a grid of threes: - // label, input type, options. - // Navigate the DOM to get these elements: - let label_field = $(this).children(".annotation-field-label"); - let type_field = $(this).parent().next(); - let options_field = $(this).parent().next().next(); + let ann_field = $(this); + + let label_field = ann_field.find(".annotation-field-label"); + let type = ann_field.find(".annotation-field-type").val(); + let option_fields = ann_field.find(".option-fields"); let label = label_field.val().replace(/\s+/g, ' '); + let no_options_added = false // Get the ID of the field, so we // can later check if it already exists. 
- let field_id = this.id.split("-")[1]; + let field_id = ann_field.attr("id").split("-")[1]; // Make sure the inputs have a label if (!label.length > 0) { label_field.addClass("invalid"); warning = "Field labels can't be empty"; } - // Make sure the names can't be duplicates + // Make sure the labels can't be duplicates else if (labels_added.includes(label)) { warning = "Field labels must be unique"; label_field.addClass("invalid"); @@ -262,29 +255,27 @@ const annotations = { label_field.addClass("invalid"); } - // Set the types and values of the annotation - type = type_field.find(".annotation-field-type").val(); - // Keep track of the labels we've added - labels_added.push(label) - + labels_added.push(label); if (type === "text" || type === "textarea") { annotation_fields[field_id] = {"type": type, "label": label}; } // Add options for dropdowns and checkboxes - else { + else if (option_fields.length > 0) { let options = []; // List of dicts, because it needs to be ordered let option_labels = []; - let no_options_added = true; - options_field.find(".option-field > input").each(function(){ - let option_label = $(this).val(); - let option_id = $(this).id.replace("input-", ""); + no_options_added = true; + option_fields.find(".option-field").each(function(){ + let option_input = $(this).find("input"); + let option_label = option_input.val(); + let option_id = option_input.attr("id").replace("option-", ""); + // New option label if (!option_labels.includes(option_label) && option_label.length > 0) { // We're using a unique key for options as well. - option = {} + let option = {} option[option_id] = option_label options.push(option); option_labels.push(option_label); @@ -300,10 +291,9 @@ const annotations = { // But there must be at least one field in there. }); - if (no_options_added) { warning = "At least one field must be added"; - $(this).find(".option-field > input").first().addClass("invalid"); + ann_field.find(".option-fields .option-field input").first().addClass("invalid"); } if (Object.keys(options).length > 0) { @@ -322,13 +312,13 @@ const annotations = { parseAnnotation: function(el) { /* - Converts the DOM objects of an annotation field - to an annotation Object. + Converts the DOM objects of an annotation + to an annotation object. - Must be given a .post-annotation div element + Must be given a .post-annotation div element. */ - console.log(el) + let ann_input = el.find(".post-annotation-input"); let ann_classes = el.attr("class").split(" "); let ann_input_classes = ann_input.attr("class").split(" "); @@ -390,7 +380,6 @@ const annotations = { "by_processor": false, // Explorer annotations are human-made! "timestamp": timestamp } - console.log(annotation) return annotation }, @@ -399,7 +388,6 @@ const annotations = { // First we collect the annotation information from the editor let annotation_fields = annotations.parseAnnotationFields(e); - let fields_to_add = {}; // Show an error message if the annotation fields were not valid. if (typeof annotation_fields == "string") { @@ -596,24 +584,25 @@ const annotations = { these have no IDs yet, we'll add a hashed database-label string when saving. */ - let annotation_field = `
    -
    - - -
    -
    -
    -
    - -
    -
    `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); - $(annotation_field).insertBefore($("#edit-annotation-fields")); + let annotation_field = ` +
  • + + + + + + + + + +
  • + `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); + $("#annotation-field-settings").append(annotation_field); }, getInputField: function(id){ @@ -621,7 +610,7 @@ const annotations = { if (id === undefined || id === 0) { id = Math.floor(Math.random() * 100000000).toString(); } - return "
    "; + return ""; }, markChanges: function(el) { @@ -652,6 +641,18 @@ const page_functions = { el.innerText = getLocalTimeStr(el.innerText); }); + // Make annotation field editor sortable + $('#annotation-field-settings').sortable({ + cursor: "s-resize", + handle: ".handle", + items: "li", + axis: "y", + containment: "#annotation-field-settings", + change: function() { + $("#apply-annotation-fields").removeClass("disabled"); + } + }); + // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index df56bca60..e36793c8b 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -1555,6 +1555,7 @@ const ui_helpers = { cursor: 'ns-resize', handle: '.handle', items: '.implicit, .explicit', + containment: '#tag-order', axis: 'y', update: function(e, ui) { let tag_order = Array.from(document.querySelectorAll('#tag-order li[data-tag]')).map(t => t.getAttribute('data-tag')).join(','); diff --git a/webtool/templates/explorer/annotation-field.html b/webtool/templates/explorer/annotation-field.html deleted file mode 100644 index 1f944d366..000000000 --- a/webtool/templates/explorer/annotation-field.html +++ /dev/null @@ -1,40 +0,0 @@ -{% set annotation_type = annotation_field["type"] %} - -{% set label = annotation_field["label"] %} - -
-{% if annotation_type == "dropdown" or annotation_type == "checkbox" %}
-	{% for option in annotation_fields[field]["options"] %}
-	{% set option_id = option.keys() | first %}
-	{% set option_label = option.values() | first %}
-	{% endfor %}
-{% else %}
    -{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/annotation-fields.html b/webtool/templates/explorer/annotation-fields.html new file mode 100644 index 000000000..e69de29bb diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index bb75e6bff..68486cbba 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -3,41 +3,49 @@
-	Label
-	Input type
-	Options
      {% if annotation_fields %} - {% for field in annotation_fields %} - {% set annotation_field = annotation_fields[field] %} - {% include "explorer/annotation-field.html" %} - {% endfor %} - {% endif %} + {% for field in annotation_fields %} + {% set annotation_field = annotation_fields[field] %} + {% set annotation_type = annotation_field["type"] %} + {% set label = annotation_field["label"] %} +
-	New field
-	Apply
    \ No newline at end of file + + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} + + + + + {% endfor %} + + + + {% endif %} + + + + {% endfor %} + {% endif %} + + +
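For reference, the editor template above serializes its fields into a single JSON object keyed by field ID, which the views below consume. A minimal sketch of the assumed shape, and of how placeholder IDs are hashed once they reach the server, follows; hash_values is approximated with SHA-256 here, and the dataset key and labels are invented for the example.

import hashlib
import json

def hash_values(value):
    # Stand-in for common.lib.helpers.hash_values; the real helper may differ
    return hashlib.sha256(str(value).encode("utf-8")).hexdigest()

dataset_key = "abcd1234"  # hypothetical dataset key

# New fields arrive from the editor with a placeholder ID
# ("tohash-<randomint>" in later patches of this series)
annotation_fields = {
    "tohash-98765": {
        "type": "dropdown",
        "label": "Stance",
        "options": {"option-1": "agree", "option-2": "disagree"},
    }
}

# The server swaps each placeholder for a stable hash of dataset key + label
for field_id in list(annotation_fields):
    if "tohash" in field_id:
        new_id = hash_values(dataset_key + annotation_fields[field_id]["label"])
        annotation_fields[new_id] = annotation_fields.pop(field_id)

print(json.dumps(annotation_fields, indent=2))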
    diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index ab48ffd73..586d3a7e1 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -210,31 +210,6 @@ def explorer_save_annotations(dataset_key: str): dataset.save_annotations(annotations, overwrite=True) return "success" - -@app.route("/explorer/get_annotation_field", methods=["GET"]) -@api_ratelimit -@login_required -@setting_required("privileges.can_run_processors") -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def get_annotation_field(): - """ - Returns an annotation field input div - - :return-error 406: If the list of subqueries could not be parsed. - """ - try: - annotation_field = json.loads(request.args.get("annotation_field")) - except (TypeError, json.decoder.JSONDecodeError): - return error(406, error="Unexpected format for annotation field.") - - html = render_template("explorer/annotation-field.html", annotation_field=annotation_field) - return jsonify({ - "status": "success", - "html": html} - ) - - def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. From af71c6c2f54892651c1cfb514051256a224f1fae Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 17:58:21 +0200 Subject: [PATCH 146/204] Fix and simplify annotation field saving, re-enable saving options (and keep 'em ordered) --- common/lib/annotation.py | 6 +- common/lib/dataset.py | 10 +- webtool/static/css/dataset-page.css | 8 +- webtool/static/js/explorer.js | 112 +++++++----------- .../templates/explorer/annotation-fields.html | 0 .../explorer/annotations-editor.html | 8 +- webtool/templates/explorer/controls.html | 4 +- .../templates/explorer/post-annotations.html | 8 +- webtool/views/views_explorer.py | 15 ++- 9 files changed, 77 insertions(+), 94 deletions(-) delete mode 100644 webtool/templates/explorer/annotation-fields.html diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 89aa95e56..153f1c6ba 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -311,6 +311,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic # Options are saved in a dict with IDs as keys and labels as values. 
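# Illustrative aside, not part of the patch: with options stored as {id: label}
# dicts, the rename detection below reduces to comparing two dicts. All values
# here are hypothetical.
example_old = {"o1": "agree", "o2": "disagree", "o3": "unsure"}
example_new = {"o1": "agree", "o2": "strongly disagree"}
example_renames = {
    old_label: example_new[option_id]
    for option_id, old_label in example_old.items()
    if option_id in example_new and old_label != example_new[option_id]
}
# example_renames == {"disagree": "strongly disagree"}; "o3" counts as deleted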
			for old_option_id, old_option in old_options.items():
+				# Renamed option label
				if old_option_id in new_options and old_option != new_options[old_option_id]:
					options_to_update[old_option] = new_options[old_option_id]  # Old label -> new label
@@ -340,6 +341,9 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic
			# Write to db
			for column, update_value in updates.items():
+				if column == "options":
+					update_value = json.dumps(update_value)
+
				# Change values of columns
				updates = db.update("annotations", {column: update_value},
									where={"dataset": dataset_key, "field_id": field_id})
@@ -350,7 +354,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic
				if column == "options":
					inserted_options = db.fetchall("SELECT id, value FROM annotations "
-												   "WHERE dataset = %s and field_id = %s" % (dataset_key, field_id))
+												   "WHERE dataset = %s AND field_id = %s", (dataset_key, field_id))
					new_inserts = []
					for inserted_option in inserted_options:
diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index 5dc1fa843..1780a1ced 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -1776,7 +1776,7 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int:
		# Get existing annotation fields to see if anything changed.
		old_fields = self.get_annotation_fields()
		changes = False
-
+
		# Do some validation
		# Annotation fields must be valid JSON.
		try:
@@ -1784,7 +1784,12 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int:
		except ValueError:
			raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields)

+		# No duplicate IDs
+		if len(new_fields) != len(set(new_fields)):
+			raise AnnotationException("Can't save annotation fields: field IDs must be unique")
+
		# Annotation fields must at minimum have `type` and `label` keys.
+		seen_labels = []
		for field_id, annotation_field in new_fields.items():
			if not isinstance(field_id, str):
				raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id)
			if "label" not in annotation_field:
				raise AnnotationException("Can't save annotation fields: field %s must have a label" % field_id)
			if "type" not in annotation_field:
				raise AnnotationException("Can't save annotation fields: field %s must have a type" % field_id)
+			if annotation_field["label"] in seen_labels:
+				raise AnnotationException("Can't save annotation fields: labels must be unique (%s)" % annotation_field["label"])
+			seen_labels.append(annotation_field["label"])

		# Keep track of whether existing fields have changed; if so, we're going to
		# update the annotations table.
diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css
index 308fdfa78..61916e156 100644
--- a/webtool/static/css/dataset-page.css
+++ b/webtool/static/css/dataset-page.css
@@ -696,9 +696,13 @@ body.image-preview {
	cursor: zoom-out;
}

-/* Explorer view */
-#annotation-fields-editor-controls {
+/* EXPLORER VIEW */
+#annotation-fields-editor {
+	height: 0;
+	overflow-y: hidden;
+}

+#annotation-fields-editor-controls {
}

#annotation-fields-editor-controls li {
diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js
index 7df7ca9a2..41dbe26a6 100644
--- a/webtool/static/js/explorer.js
+++ b/webtool/static/js/explorer.js
@@ -53,21 +53,21 @@
		// Keep track of when the annotation fields were edited.
editor_controls.on("click", "#apply-annotation-fields, .delete-input, .delete-option-field", function() { - $("#apply-annotation-fields").removeClass("disabled"); + edits_made = true; + annotations.enableSaving(); }); editor_controls.on("change keydown", "input, select", function() { - $("#apply-annotation-fields").removeClass("disabled"); + edits_made = true; + annotations.enableSaving(); }); // Show and hide annotations $("#toggle-annotations").on("click", function(){ - if (!$(this).hasClass("disabled")) { - if ($(this).hasClass("shown")) { - annotations.hideAnnotations(); - } - else { - annotations.showAnnotations(); - } + if ($(this).hasClass("shown")) { + annotations.hideAnnotations(); + } + else { + annotations.showAnnotations(); } }); @@ -103,8 +103,8 @@ const annotations = { // Make saving available when annotations are changed let post_annotations = $(".post-annotations"); - post_annotations.on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); - post_annotations.on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); + post_annotations.on("keydown", "input, textarea", function() { edits_made = true;}); + post_annotations.on("click", "option, input[type=checkbox], label", function() { edits_made = true;}); // Keep track of whether the annotations are edited or not. post_annotations.on("keydown change", @@ -116,25 +116,21 @@ const annotations = { // Save the annotations to the database $("#save-annotations").on("click", function(){ - if (!$(this).hasClass("disabled")) { - annotations.saveAnnotations(); - } + annotations.saveAnnotations(); }); // Save unsaved annotations upon changing a page. $('.page > a').click(function(){ - if (!$("#save-annotations").hasClass('disabled')) { - annotations.saveAnnotations(); - } + annotations.saveAnnotations(); }) // Check whether there's already fields saved for this dataset annotations.fieldsExist(); - // Save annotations every 10 seconds + // Save annotations every 10 seconds if changes have been made setInterval(function() { - if (!$("#save-annotations").hasClass("disabled") && edits_made) { - annotations.saveAnnotations(); + if (edits_made) { + //annotations.saveAnnotations(); } }, 10000); @@ -262,7 +258,7 @@ const annotations = { } // Add options for dropdowns and checkboxes else if (option_fields.length > 0) { - let options = []; // List of dicts, because it needs to be ordered + let options = new Map(); // Map, because it needs to be ordered let option_labels = []; no_options_added = true; @@ -271,13 +267,11 @@ const annotations = { let option_input = $(this).find("input"); let option_label = option_input.val(); let option_id = option_input.attr("id").replace("option-", ""); + // New option label if (!option_labels.includes(option_label) && option_label.length > 0) { - // We're using a unique key for options as well. 
- let option = {} - option[option_id] = option_label - options.push(option); + options.set(option_id, option_label); option_labels.push(option_label); no_options_added = false; } @@ -296,10 +290,10 @@ const annotations = { ann_field.find(".option-fields .option-field input").first().addClass("invalid"); } - if (Object.keys(options).length > 0) { + if (options.size > 0) { // Strip whitespace from the input field key label = label.replace(/\s+/g, ' '); - annotation_fields[field_id] = {"type": type, "label": label, "options": options}; + annotation_fields[field_id] = {"type": type, "label": label, "options": Object.fromEntries(options)}; } } }); @@ -411,7 +405,6 @@ const annotations = { // We store the annotation fields in the dataset table. annotations.saveAnnotationFields(annotation_fields); - location.reload(); } }, @@ -427,28 +420,25 @@ const annotations = { annotations.fieldsExist(); let dataset_key = $("#dataset-key").text(); - // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: JSON.stringify(annotation_fields), - + data: JSON.stringify(annotation_fields), success: function (response) { - // If the query is accepted by the server. - if (response === 'success') { - $("#annotations-editor-container").hide(); - $("#apply-annotation-fields").addClass("disabled"); - } + // If the query is accepted by the server... - // If the query is rejected by the server. - else { - annotations.warnEditor("Couldn't save annotation fields"); - } + //location.reload(); // ...simply reload the page to render the template again }, error: function (error) { - annotations.warnEditor(error); + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); + } + else { + annotations.warnEditor("Server error, couldn't save annotation fields.") + } + $("#apply-annotation-fields").html(" Apply"); } }); }, @@ -481,7 +471,6 @@ const annotations = { let save_annotations = $("#save-annotations"); save_annotations.html(" Saving annotations") - annotations.disableSaving(); let code = "" @@ -494,27 +483,16 @@ const annotations = { success: function (response) { if (response === 'success') { - code = response - - annotations.enableSaving(); - save_annotations.html(" Annotations saved"); - save_annotations.addClass("disabled"); - //var old_annotation_fields = $("#annotation-field").each(); - // alert(alert_message); + code = response; } else { - annotations.enableSaving(); - save_annotations.html(" Save annotations"); alert("Couldn't save annotations"); - save_annotations.removeClass("disabled"); console.log(response); } + save_annotations.html(" Save annotations"); }, error: function (error) { - annotations.enableSaving(); save_annotations.html(" Save annotations"); - save_annotations.removeClass("disabled"); - //alert("Couldn't save annotations"); console.log(error) } }); @@ -524,21 +502,12 @@ const annotations = { // Annotation fields are sent by the server // and saved in a script in the header. // So we just need to check whether they're there. 
- - if (Object.keys(annotation_fields).length < 1) { - $("#toggle-annotations").addClass("disabled"); - return false; - } - else { - $("#toggle-annotations").removeClass("disabled"); - return true; - } + return Object.keys(annotation_fields).length >= 1; }, enableSaving: function(){ // Enable saving annotations to the database $("#save-annotations, #save-to-dataset").removeClass("disabled"); - $("#save-annotations").html(" Save annotations"); }, disableSaving: function(){ @@ -547,7 +516,7 @@ const annotations = { }, warnEditor: function(warning) { - + // Warns the annotation field editor if stuff's wrong let warn_field = $("#input-warning"); warn_field.html(warning); if (warn_field.hasClass("hidden")) { @@ -559,7 +528,6 @@ const annotations = { showAnnotations: function() { let ta = $("#toggle-annotations"); ta.addClass("shown"); - ta.removeClass("disabled"); ta.html(" Hide annotations"); // Bit convoluted, but necessary to have auto height let pa = $(".post-annotations"); @@ -585,7 +553,7 @@ const annotations = { */ let annotation_field = ` -
  • @@ -614,8 +582,8 @@ const annotations = { }, markChanges: function(el) { - // Adds current changes to a post annotation so we can save these later. - // Currently includes the time of edits and the username of the annotator + // Adds info on edits on post annotation to its element, so we can save these to the db later. + // Currently includes the time of edits and the username of the annotator. let current_username = $("#current-username").html(); let current_date = Date.now() / 1000; let input_field = el.find(".post-annotation-input"); @@ -641,7 +609,7 @@ const page_functions = { el.innerText = getLocalTimeStr(el.innerText); }); - // Make annotation field editor sortable + // Make annotation field editor sortable with jQuery UI. $('#annotation-field-settings').sortable({ cursor: "s-resize", handle: ".handle", @@ -649,7 +617,7 @@ const page_functions = { axis: "y", containment: "#annotation-field-settings", change: function() { - $("#apply-annotation-fields").removeClass("disabled"); + } }); diff --git a/webtool/templates/explorer/annotation-fields.html b/webtool/templates/explorer/annotation-fields.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index 68486cbba..b03d62be8 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -25,11 +25,9 @@ {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} + {% for option_id, option_label in annotation_fields[field]["options"].items() %} - + {% endfor %} @@ -45,7 +43,7 @@
diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html
index 44d816b7f..98c8f01f1 100644
--- a/webtool/templates/explorer/controls.html
+++ b/webtool/templates/explorer/controls.html
@@ -23,8 +23,8 @@

	Edit fields
-	Show annotations
-	{% if not annotations %}No annotations{% else %}Annotations saved{% endif %}
+	Show annotations
+	Save annotations
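The Save annotations control above posts every edited annotation to the server as one JSON list. The sketch below shows the rough shape of a single object in that payload, mirroring the keys that parseAnnotation collects in explorer.js; the values and the structural check are illustrative only, not 4CAT code.

def looks_like_annotation(annotation):
    # Minimal structural check; 4CAT's own validation is more involved
    required = {"field_id", "item_id", "label", "type", "value"}
    return required.issubset(annotation)

payload = [{
    "field_id": "1a2b3c",       # hashed field ID
    "item_id": "post-123",      # hypothetical ID of the annotated item
    "label": "Stance",
    "type": "dropdown",
    "value": "agree",
    "author": "some-user",      # hypothetical username
    "by_processor": False,      # Explorer annotations are human-made
    "timestamp": 1724342400,
}]

assert all(looks_like_annotation(a) for a in payload)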
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} + {% for option_id, option_label in annotation_fields[field]["options"].items() %} {% set checked = "checked" if option_label in annotation.value else "" %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 586d3a7e1..7a6323a4a 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -13,7 +13,7 @@ from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet from common.lib.helpers import convert_to_float, hash_values -from common.lib.exceptions import DataSetException +from common.lib.exceptions import DataSetException, AnnotationException from common.config_manager import ConfigWrapper config = ConfigWrapper(config, user=current_user, request=request) @@ -148,7 +148,7 @@ def explorer_dataset(dataset_key: str, page=1, show_annotations=False): @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(dataset_key: str) -> str: +def explorer_save_annotation_fields(dataset_key: str): """ Save the annotation fields of a dataset to the datasets table. @@ -174,14 +174,19 @@ def explorer_save_annotation_fields(dataset_key: str) -> str: # dataset key and the input label (should be unique) field_keys = list(annotation_fields.keys()) for field_id in field_keys: - if "undefined" in field_id: + if "tohash" in field_id: new_field_id = hash_values(dataset_key + annotation_fields[field_id]["label"]) annotation_fields[new_field_id] = annotation_fields[field_id] del annotation_fields[field_id] - dataset.save_annotation_fields(annotation_fields) + try: + fields_saved = dataset.save_annotation_fields(annotation_fields) + except AnnotationException as e: + # If anything went wrong with the annotation field saving, return an error. + return jsonify(error=str(e)), 400 - return "success" + # Else return the amount of fields saved. + return str(fields_saved) @app.route("/explorer/save_annotations/", methods=["POST"]) @api_ratelimit From a417283d399b173a56d2bb58821401c1a564484c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 20:52:03 +0200 Subject: [PATCH 147/204] Forgot a postgresql field in migrate script --- helper-scripts/migrate/migrate-1.45-1.46.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 719507098..2894c1cb6 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -105,7 +105,7 @@ count = 0 skipped_count = 0 - columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,author_original,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: @@ -167,6 +167,7 @@ json.dumps(options) if options else "", # options; each option has a key and a value. 
value, # value author, # author + author, # author_original False, # by_processor json.dumps({}), # metadata )] From 09f26dc1f91f363482f9f60fdefa5198d1ea8fc8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 20:52:42 +0200 Subject: [PATCH 148/204] Revamp annotation saving from annotations made in Explorer --- common/lib/annotation.py | 2 +- webtool/static/js/explorer.js | 130 ++++++++---------- .../templates/explorer/post-annotations.html | 22 +-- webtool/views/views_explorer.py | 10 +- 4 files changed, 74 insertions(+), 90 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 153f1c6ba..147378c57 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -116,7 +116,7 @@ def __init__(self, data=None, id=None, db=None): "options": data.get("options", ""), "value": data.get("value", ""), "author": data.get("author", ""), - "author_original": data.get("author_original", ""), + "author_original": data.get("author", ""), "by_processor": data.get("by_processor", False), "metadata": data.get("metadata", {}), } diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 41dbe26a6..2957d5366 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -100,18 +100,19 @@ const annotations = { annotations.addOptions(e.target); } }); - - // Make saving available when annotations are changed - let post_annotations = $(".post-annotations"); - post_annotations.on("keydown", "input, textarea", function() { edits_made = true;}); - post_annotations.on("click", "option, input[type=checkbox], label", function() { edits_made = true;}); // Keep track of whether the annotations are edited or not. - post_annotations.on("keydown change", - ".post-annotation-input, .post-annotation input, .post-annotation textarea", + let post_annotations = $(".post-annotations"); + post_annotations.on("keydown click change", + ".post-annotation-input, input[type=checkbox], label, option", function(){ - annotations.markChanges($(this).parent()); - + edits_made = true; + // Navigate one level up if it's a checkbox input + let parent = $(this).parent(); + if (parent.hasClass("checkboxes")) { + parent = parent.parent(); + } + annotations.markChanges(parent); }); // Save the annotations to the database @@ -310,71 +311,55 @@ const annotations = { to an annotation object. Must be given a .post-annotation div element. - */ let ann_input = el.find(".post-annotation-input"); let ann_classes = el.attr("class").split(" "); - let ann_input_classes = ann_input.attr("class").split(" "); - let field_id = ann_input_classes[1].replace("field-", ""); - let annotation_type = ann_classes[2].replace("type-", ""); + let ann_type = ann_classes[2].replace("type-", ""); + let field_id = ann_classes[1].replace("field-", ""); let item_id = ann_classes[3].replace("item-id-", ""); let label = el.find(".annotation-label").text(); let author = el.find(".annotation-author").html(); + let options = el.find(".annotation-options").html(); let timestamp = parseInt(el.find(".epoch-timestamp-edited").html()); let val = undefined; - let edited = false - - if (annotation_type === "text" || annotation_type === "textarea") { - val = ann_input.val(); - // It can be the case that the input text is deleted - // In this case we *do* want to push new data, so we check - // whether there's an 'edited' class present and save if so. - if (ann_input.hasClass("edited")) { - edited = true + + // If there are values inserted or things changed, return an annotation object. 
+ // even if the value is an empty string. + if (el.hasClass("edited")) { + if (ann_type === "text" || ann_type === "textarea") { + val = ann_input.val(); + } else if (ann_type === "dropdown") { + val = ann_input.find(".post-annotation-options").val(); + } else if (ann_type === "checkbox") { + val = []; + el.find(".post-annotation-input").each(function () { + let checkbox = $(this); + if (checkbox.prop("checked") === true) { + val.push(checkbox.val()); + } + }); } - } - else if (annotation_type === "dropdown") { - val = ann_input.find(".post-annotation-options").val(); - } - else if (annotation_type === "checkbox") { - val = []; - ann_input.find(".post-annotation-options > input").each(function(){ - if (ann_input.is(":checked")) { - val.push(ann_input.val()); - } - if (ann_input.hasClass("edited")) { - edited = true - } - }); - if (!val.length > 0) { - val = undefined; + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": field_id, + "item_id": item_id, + "label": label, + "type": ann_type, + "value": val, + "author": author, + "by_processor": false, // Explorer annotations are human-made! + "timestamp": timestamp, + "options": options, } + return annotation; } - - // if ((val !== undefined && val !== "") || edited) { - // vals_changed = true; - // val = ""; - // console.log("EDITED") - // } - // - // if (vals_changed){ - // annotation[post_id] = post_vals; - // } - - // Create an annotation object and add them to the array. - let annotation = { - "field_id": field_id, - "item_id": item_id, - "label": label, - "type": annotation_type, - "value": val, - "author": author, - "by_processor": false, // Explorer annotations are human-made! - "timestamp": timestamp + else { + // Return an empty object if nothing changed + return {}; } - return annotation }, applyAnnotationFields: function (e){ @@ -428,8 +413,7 @@ const annotations = { data: JSON.stringify(annotation_fields), success: function (response) { // If the query is accepted by the server... - - //location.reload(); // ...simply reload the page to render the template again + location.reload(); // ...simply reload the page to render the template again }, error: function (error) { if (error.status == 400) { @@ -461,8 +445,7 @@ const annotations = { // Extract annotation object from the element let annotation = annotations.parseAnnotation($(this)); - - if (annotation) { + if (Object.keys(annotation).length > 0 ) { anns.push(annotation); } }); @@ -472,8 +455,6 @@ const annotations = { let save_annotations = $("#save-annotations"); save_annotations.html(" Saving annotations") - let code = "" - $.ajax({ url: getRelativeURL("explorer/save_annotations/" + dataset_key), type: "POST", @@ -481,17 +462,15 @@ const annotations = { data: JSON.stringify(anns), success: function (response) { - - if (response === 'success') { - code = response; - } - else { - alert("Couldn't save annotations"); - console.log(response); - } save_annotations.html(" Save annotations"); }, error: function (error) { + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); + } + else { + annotations.warnEditor("Server error, couldn't save annotation fields.") + } save_annotations.html(" Save annotations"); console.log(error) } @@ -586,8 +565,7 @@ const annotations = { // Currently includes the time of edits and the username of the annotator. 
let current_username = $("#current-username").html(); let current_date = Date.now() / 1000; - let input_field = el.find(".post-annotation-input"); - input_field.addClass("edited"); + $(el).addClass("edited"); $(el).find(".annotation-author").html(current_username); $(el).find(".epoch-timestamp-edited").html(current_date); $(el).find(".timestamp-edited").html(getLocalTimeStr(current_date)); diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index 842ac3a4a..75cc32c7b 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -22,13 +22,13 @@
	{% if type == 'text' %}
-
+
	{% elif type == 'textarea' %}
-
+
	{% elif type == 'dropdown' %}
-
+
		{% for option_id, option_label in annotation_fields[field]["options"].items() %}
@@ -37,10 +37,9 @@
	{% elif type == 'checkbox' %}
-
+
		{% for option_id, option_label in annotation_fields[field]["options"].items() %}
			{% set checked = "checked" if option_label in annotation.value else "" %}
-
		{% endfor %}
@@ -69,15 +68,16 @@
		{% endfor %}
	{% endif %}
-
+
{% endif %}

-	{# Store some invisible data here so we can retrieve it with JS #}
-
+	{# Store some invisible data here so we can retrieve it with JS #}
+
    {% endfor %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 7a6323a4a..9182d0c68 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -212,8 +212,14 @@ def explorer_save_annotations(dataset_key: str): except DataSetException: return error(404, error="Dataset not found.") - dataset.save_annotations(annotations, overwrite=True) - return "success" + try: + annotations_saved = dataset.save_annotations(annotations, overwrite=True) + except AnnotationException as e: + # If anything went wrong with the annotation field saving, return an error. + return jsonify(error=str(e)), 400 + + # Else return the amount of fields saved. + return str(annotations_saved) def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ From 88b760901e023506debae4f33ee64d79e61f5eec Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 23 Aug 2024 17:54:23 +0200 Subject: [PATCH 149/204] Add saving notice and fix dropdown saving --- common/lib/annotation.py | 2 +- webtool/static/css/dataset-page.css | 17 ++ webtool/static/js/explorer.js | 194 +++++++++++++----- webtool/static/js/fourcat.js | 1 - webtool/templates/explorer/explorer.html | 4 +- .../templates/explorer/post-annotations.html | 8 +- 6 files changed, 163 insertions(+), 63 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 147378c57..3af037a6e 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -306,7 +306,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic fields_to_delete.add(field_id) continue - old_options = old_field["options"] + old_options = old_field.get("options", {}) options_to_update = {} # Options are saved in a dict with IDs as keys and labels as values. diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 61916e156..47585d58f 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -771,6 +771,23 @@ body.image-preview { padding: 0.5em } +.post-annotations .property-badge { + font-size: 13px; +} + +#save-annotations-notice { + position: fixed; + background-color: var(--accent-okay); + color: var(--contrast-bright); + display: none; + right: 20px; + bottom: 64px; + width: 200px; + text-align: center; + padding: 10px 5px 10px 5px; + border-radius: 10px; +} + .delete-input { float: right; } \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 2957d5366..69195913a 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -5,6 +5,12 @@ $(init); /* * Page init */ + +// Global variable to keep track if annotations have been edited. +var edits_made = false; +// To check if we have to save annotations when exiting a page; not necessary for refresh. 
+var exit_page = true; + function init() { // Functional stuff @@ -20,11 +26,11 @@ function init() { */ const annotations = { + init: function() { let editor = $("#annotation-fields-editor"); let editor_controls = $("#annotation-fields-editor-controls"); - var edits_made = false; // Add a new annotation field when clicking the plus icon $("#new-annotation-field").on("click", function(){ @@ -107,9 +113,10 @@ const annotations = { ".post-annotation-input, input[type=checkbox], label, option", function(){ edits_made = true; - // Navigate one level up if it's a checkbox input + let parent = $(this).parent(); - if (parent.hasClass("checkboxes")) { + // Navigate one level up if it's a checkbox or dropdown input + if (parent.hasClass("post-annotation-options")) { parent = parent.parent(); } annotations.markChanges(parent); @@ -120,10 +127,12 @@ const annotations = { annotations.saveAnnotations(); }); - // Save unsaved annotations upon changing a page. - $('.page > a').click(function(){ - annotations.saveAnnotations(); - }) + // Save unsaved annotations upon leaving the page. + window.onbeforeunload = function(){ + if (exit_page) { + annotations.saveAnnotations(); + } + }; // Check whether there's already fields saved for this dataset annotations.fieldsExist(); @@ -131,7 +140,7 @@ const annotations = { // Save annotations every 10 seconds if changes have been made setInterval(function() { if (edits_made) { - //annotations.saveAnnotations(); + annotations.saveAnnotations(); } }, 10000); @@ -327,50 +336,47 @@ const annotations = { // If there are values inserted or things changed, return an annotation object. // even if the value is an empty string. - if (el.hasClass("edited")) { - if (ann_type === "text" || ann_type === "textarea") { - val = ann_input.val(); - } else if (ann_type === "dropdown") { - val = ann_input.find(".post-annotation-options").val(); - } else if (ann_type === "checkbox") { - val = []; - el.find(".post-annotation-input").each(function () { - let checkbox = $(this); - if (checkbox.prop("checked") === true) { - val.push(checkbox.val()); - } - }); - } - // Create an annotation object and add them to the array. - let annotation = { - "field_id": field_id, - "item_id": item_id, - "label": label, - "type": ann_type, - "value": val, - "author": author, - "by_processor": false, // Explorer annotations are human-made! - "timestamp": timestamp, - "options": options, - } - return annotation; + if (ann_type === "text" || ann_type === "textarea") { + val = ann_input.val(); + } else if (ann_type === "dropdown") { + val = $(ann_input).find(":selected").val(); + } else if (ann_type === "checkbox") { + val = []; + el.find(".post-annotation-input").each(function () { + let checkbox = $(this); + if (checkbox.prop("checked") === true) { + val.push(checkbox.val()); + } + }); } - else { - // Return an empty object if nothing changed - return {}; + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": field_id, + "item_id": item_id, + "label": label, + "type": ann_type, + "value": val, + "author": author, + "by_processor": false, // Explorer annotations are human-made! + "timestamp": timestamp, + "options": options, } + //console.log(annotation) + return annotation; }, applyAnnotationFields: function (e){ // Applies the annotation fields to each post on this page. 
// First we collect the annotation information from the editor - let annotation_fields = annotations.parseAnnotationFields(e); + + let new_annotation_fields = annotations.parseAnnotationFields(e); // Show an error message if the annotation fields were not valid. - if (typeof annotation_fields == "string") { - annotations.warnEditor(annotation_fields); + if (typeof new_annotation_fields == "string") { + annotations.warnEditor(new_annotation_fields); } // If everything is ok, we're going to add @@ -389,33 +395,42 @@ const annotations = { }); // We store the annotation fields in the dataset table. - annotations.saveAnnotationFields(annotation_fields); + // First check if existing annotations are affected. + if (annotation_fields) { + annotations.checkFieldChanges(new_annotation_fields, annotation_fields); + } + else { + annotations.saveAnnotationFields(new_annotation_fields); + } } }, - saveAnnotationFields: function (annotation_fields){ + saveAnnotationFields: function (new_fields){ // Save the annotation fields used for this dataset // to the datasets table. + // `old fields` can be given to warn the user if changes to existing fields + // will affect annotations, like deleting a field or changing its type. + + let dataset_key = $("#dataset-key").text(); - if (annotation_fields.length < 1) { + if (new_fields.length < 1) { return; } - // If there's annotation fields, we can enable/disable the buttons - annotations.fieldsExist(); - - let dataset_key = $("#dataset-key").text(); // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: JSON.stringify(annotation_fields), - success: function (response) { + data: JSON.stringify(new_fields), + success: function () { // If the query is accepted by the server... + exit_page = false; location.reload(); // ...simply reload the page to render the template again }, error: function (error) { + console.log(error); + if (error.status == 400) { annotations.warnEditor(error.responseJSON.error); } @@ -427,6 +442,62 @@ const annotations = { }); }, + checkFieldChanges(new_fields, old_fields) { + + let deleted_fields = []; + let changed_type_fields = []; + + // Warn the user in case fields are deleted or changed from text to choice. + if (old_fields) { + let text_fields = ["text", "textarea"]; + let choice_fields = ["checkbox", "dropdown"]; + + for (let old_field_id in old_fields) { + + // Deleted + if (!(old_field_id in new_fields) || !new_fields) { + deleted_fields.push(old_fields[old_field_id]["label"]); + } else { + let old_type = old_fields[old_field_id]["type"]; + let new_type = new_fields[old_field_id]["type"] + if (old_type !== new_type) { + // Changed from text to choice, or the reverse. + // In this case annotations will be deleted. + // Changes from dropdown to checkbox also result in deleted annotations. + if ((text_fields.includes(old_type) && choice_fields.includes(new_type)) || + (choice_fields.includes(old_type) && text_fields.includes(new_type)) || + (choice_fields.includes(old_type) && choice_fields.includes(new_type))) { + changed_type_fields.push(new_type); + } + } + } + } + } + + // Ask 4 confirmation + if (deleted_fields.length > 0 || changed_type_fields.length > 0) { + let msg = ""; + if (deleted_fields.length > 0 && changed_type_fields.length > 0) { + msg = `Deleting fields and changing field types will also delete existing annotations that belonged to them. 
+ Do you want to continue?`; + } + else if (changed_type_fields.length > 0) { + msg = `Changing field types will also delete existing annotations that belonged to them. + Do you want to continue?`; + } + else if (deleted_fields.length > 0) { + msg = `Deleting fields will also delete existing annotations that belonged to them. + Do you want to continue?`; + } + popup.confirm(msg, "Confirm", () => { + annotations.saveAnnotationFields(new_fields); + }); + } + else { + annotations.saveAnnotationFields(new_fields); + } + }, + saveAnnotations: function (){ // Write the annotations to the dataset and annotations table. @@ -443,10 +514,12 @@ const annotations = { post_annotations.find(".post-annotation").each(function(){ - // Extract annotation object from the element - let annotation = annotations.parseAnnotation($(this)); - if (Object.keys(annotation).length > 0 ) { - anns.push(annotation); + // Extract annotation object from edited elements + if ($(this).hasClass("edited")) { + let annotation = annotations.parseAnnotation($(this)); + if (Object.keys(annotation).length > 0 ) { + anns.push(annotation); + } } }); } @@ -463,16 +536,18 @@ const annotations = { success: function (response) { save_annotations.html(" Save annotations"); + annotations.notifySaved(); + edits_made = false; }, error: function (error) { + console.log(error) if (error.status == 400) { annotations.warnEditor(error.responseJSON.error); } else { - annotations.warnEditor("Server error, couldn't save annotation fields.") + annotations.warnEditor("Server error, couldn't save annotations.") } save_annotations.html(" Save annotations"); - console.log(error) } }); }, @@ -504,6 +579,13 @@ const annotations = { } }, + notifySaved: function() { + // Flash a fixed div with the notice that annotations are saved. + let notice = $("#save-annotations-notice"); + notice.fadeIn(400); + notice.delay(1750).fadeOut(1000); + }, + showAnnotations: function() { let ta = $("#toggle-annotations"); ta.addClass("shown"); diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index e36793c8b..7182ff1d5 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -1637,7 +1637,6 @@ const ui_helpers = { } }, - /** * Ask for confirmation before doing whatever happens when the event goes through * diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 7301190f4..a19aa23c4 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -12,7 +12,7 @@