From 1d3f58730e6f49726ffd0a2747f54857e84c9fd5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 8 Apr 2024 12:46:31 +0200 Subject: [PATCH 001/204] First commit :) --- webtool/templates/explorer/header.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html index f700a10a4..957090eac 100644 --- a/webtool/templates/explorer/header.html +++ b/webtool/templates/explorer/header.html @@ -4,7 +4,7 @@

- 4CAT Explorer (beta){% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %} + 4CAT Explorer {% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %}

{{ key }} From c4a46069393f71c4eac8e19c1b6f32610eab3388 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 8 Apr 2024 17:09:14 +0200 Subject: [PATCH 002/204] Use regular `iterate_items` method when looping through dataset + minor changes --- webtool/__init__.py | 2 +- webtool/static/js/explorer.js | 4 +- webtool/templates/explorer/header.html | 2 +- webtool/templates/explorer/nav-pages.html | 2 +- .../{api_explorer.py => views_explorer.py} | 77 ++++++++++--------- 5 files changed, 46 insertions(+), 41 deletions(-) rename webtool/views/{api_explorer.py => views_explorer.py} (91%) diff --git a/webtool/__init__.py b/webtool/__init__.py index 8a1e38a5b..766fc6509 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -106,8 +106,8 @@ import webtool.views.views_dataset import webtool.views.views_misc +import webtool.views.views_explorer -import webtool.views.api_explorer import webtool.views.api_standalone import webtool.views.api_tool diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c421cc001..1ee2acc9a 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -596,14 +596,14 @@ const annotations = { data: json_annotations, success: function (response) { - // If the query is rejected by the server. + // If the query is accepted by the server. if (response == 'success') { $("#annotations-editor-container").hide(); $("#save-annotation-fields").addClass("invalid") $("#save-annotation-fields").prop("disabled", true); } - // If the query is accepted by the server. + // If the query is rejected by the server. else { annotations.warnEditor("Couldn't save annotation fields"); } diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html index 957090eac..ab3472fa7 100644 --- a/webtool/templates/explorer/header.html +++ b/webtool/templates/explorer/header.html @@ -15,7 +15,7 @@

Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

{% set post_count = max_posts %} {% endif %} -

Showing posts {{ offset + 1 }} - {{ post_count if (offset + limit) > post_count else (offset + limit) }} ({{ post_count }} in total).

+

Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

{% if custom_fields and custom_fields[0] == "invalid" %}

Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

{% endif %} diff --git a/webtool/templates/explorer/nav-pages.html b/webtool/templates/explorer/nav-pages.html index 23fe84f8d..b212f7d65 100644 --- a/webtool/templates/explorer/nav-pages.html +++ b/webtool/templates/explorer/nav-pages.html @@ -2,7 +2,7 @@ {% if post_count > max_posts %} {% set post_count = max_posts %} {% endif %} - {% set pages = ((post_count / limit) + (post_count % limit > 0))|int %} + {% set pages = ((post_count / posts_per_page) + (post_count % posts_per_page > 0))|int %} {% set selected = "selected" %} {% set lower_bound = 3 %} {% set upper_bound = pages - 2 %} diff --git a/webtool/views/api_explorer.py b/webtool/views/views_explorer.py similarity index 91% rename from webtool/views/api_explorer.py rename to webtool/views/views_explorer.py index 5131c1ca8..d07ab5273 100644 --- a/webtool/views/api_explorer.py +++ b/webtool/views/views_explorer.py @@ -1,5 +1,6 @@ """ -4CAT Data API - endpoints to get post and thread data from +4CAT Explorer views - pages that display datasets akin to +the 'native' appearance of the platform they were retrieved from. """ import datetime @@ -35,7 +36,7 @@ @openapi.endpoint("explorer") def explorer_dataset(key, page): """ - Show posts from a specific dataset + Show posts from a dataset :param str dataset_key: Dataset key @@ -54,20 +55,20 @@ def explorer_dataset(key, page): return error(403, error="This dataset is private.") if len(dataset.get_genealogy()) > 1: - return error(404, error="Exporer only available for top-level datasets") + return error(404, error="Explorer is only available for top-level datasets") results_path = dataset.check_dataset_finished() if not results_path: - return error(404, error="This dataset didn't finish executing (yet)") + return error(404, error="This dataset didn't finish executing") # The amount of posts to show on a page - limit = config.get("explorer.posts_per_page", 50) + posts_per_page = config.get("explorer.posts_per_page", 50) # The amount of posts that may be included (limit for large datasets) max_posts = config.get('explorer.max_posts', 500000) # The offset for posts depending on the current page - offset = ((page - 1) * limit) if page else 0 + offset = ((page - 1) * posts_per_page) if page else 0 # Load some variables parameters = dataset.get_parameters() @@ -83,7 +84,7 @@ if datasource in list(all_modules.datasources.keys()): is_local = True if all_modules.datasources[datasource].get("is_local") else False - # Check if we have to sort the data in a specific way. + # Check if we have to sort the data. sort_by = request.args.get("sort") if sort_by == "dataset-order": sort_by = None @@ -107,27 +108,25 @@ posts = [] count = 0 - first_post = False - - for post in iterate_items(results_path, max_rows=max_posts, sort_by=sort_by, descending=descending, force_int=force_int): + try: + for row in dataset.iterate_items(warn_unmappable=False): - count += 1 + count += 1 - # Use an offset if we're showing a page beyond the first. - if count <= offset: - continue + # Use an offset if we're showing a page beyond the first. + if count <= offset: + continue - # Attribute column names and collect dataset's posts. - post_ids.append(post["id"]) - posts.append(post) + # Attribute column names and collect dataset's posts. + post_ids.append(row["id"]) + posts.append(row) - if "link_id" in post: - if post["link_id"][2] == "_": - post["link_id"] = post["link_id"][3:] + # Stop if we exceed the allowed posts per page or the maximum number of posts.
+ if count >= (offset + posts_per_page) or count > max_posts: + break - # Stop if we exceed the max posts per page. - if count >= (offset + limit) or count > max_posts: - break + except NotImplementedError: + return error(404) # Include custom css if it exists in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.css'. @@ -160,16 +159,17 @@ annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, limit=limit, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) @app.route('/explorer/thread/<datasource>/<board>/<thread_id>') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_thread(datasource, board, thread_id): +def explorer_local_thread(datasource, board, thread_id): """ - Show a thread in the explorer + Show a thread. This is only available for local data sources, + and will be deprecated/changed in future updates. :param str datasource: Data source ID :param str board: Board name @@ -191,7 +191,7 @@ max_posts = config.get('explorer.max_posts', 500000) # Get the posts with this thread ID. - posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) if not posts: return error(404, error="No posts available for this thread") @@ -207,16 +207,18 @@ # The file's naming format should e.g. be 'reddit-explorer.json'. custom_fields = get_custom_fields(datasource) - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) @app.route('/explorer/post/<datasource>/<board>/<thread_id>') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_post(datasource, board, thread_id): +def explorer_local_posts(datasource, board, thread_id): """ - Show a thread in the explorer + Show posts from a local data source. + This is only available for local data sources, + and will be deprecated/changed in future updates. :param str datasource: Data source ID :param str board: Board name @@ -235,7 +237,7 @@ return error(404, error="No thread ID provided") # Get the posts with this thread ID.
- posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) posts = [strip_html(post) for post in posts] posts = [format(post) for post in posts] @@ -248,7 +250,7 @@ def explorer_post(datasource, board, thread_id): # The file's naming format should e.g. be 'reddit-explorer.json'. custom_fields = get_custom_fields(datasource) - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts)) + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, posts_per_page=len(posts), post_count=len(posts)) @app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @@ -482,7 +484,7 @@ def get_boards(datasource): Get available boards in datasource :param datasource: The datasource for which to acquire the list of available - boards. + boards. :return: A list containing a list of `boards`, as string IDs. :return-schema: {type=object,properties={ @@ -503,7 +505,7 @@ def get_boards(datasource): @app.route('/api/imagefile/') @login_required @setting_required("privileges.can_use_explorer") -def get_image_file(img_file, limit=0): +def get_image_file(img_file): """ Returns an image based on filename Request should hex the md5 hashes first (e.g. with hexdigest()) @@ -518,7 +520,7 @@ def get_image_file(img_file, limit=0): return send_file(str(image_path)) -def iterate_items(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): +def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): """ Loop through both csv and NDJSON files. :param in_file, str: The input file to read. @@ -582,7 +584,10 @@ def iterate_items(in_file, max_rows=None, sort_by=None, descending=False, force_ return Exception("Can't loop through file with extension %s" % suffix) -def get_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): +def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): + """ + Retrieve posts from a local data source based on post IDs. + """ if not ids: return None From cac644e79b998590d3af8831a9fb32e671730173 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 15:10:14 +0200 Subject: [PATCH 003/204] Change wording in Explorer settings --- common/lib/config_definition.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d1c5d6ea9..b1746bb78 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -102,8 +102,8 @@ "privileges.can_use_explorer": { "type": UserInput.OPTION_TOGGLE, "default": True, - "help": "Can use explorer", - "tooltip": "Controls whether users can use the Explorer feature to navigate datasets." + "help": "Can use Explorer", + "tooltip": "Controls whether users can use the Explorer feature to analyse and annotate datasets." 
}, "privileges.can_export_datasets": { "type": UserInput.OPTION_TOGGLE, @@ -305,13 +305,12 @@ "global": True }, # Explorer settings - # The maximum allowed amount of rows (prevents timeouts and memory errors) "explorer.max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", "coerce_type": int, - "tooltip": "Amount of posts to show in Explorer. The maximum allowed amount of rows (prevents timeouts and " + "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, "explorer.posts_per_page": { @@ -319,7 +318,7 @@ "default": 50, "help": "Posts per page", "coerce_type": int, - "tooltip": "Posts to display per page" + "tooltip": "Number of posts to display per page" }, # Web tool settings # These are used by the FlaskConfig class in config.py @@ -515,7 +514,7 @@ "4cat": "4CAT Tool settings", "api": "API credentials", "flask": "Flask settings", - "explorer": "Data Explorer", + "explorer": "Explorer", "datasources": "Data sources", "expire": "Dataset expiration settings", "mail": "Mail settings & credentials", From 8b78452fdeb70398ada668e966fb047f8dd83de5 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 15:11:03 +0200 Subject: [PATCH 004/204] Allow Explorer CSS to be inserted and changed in Settings --- datasources/bitchute/search_bitchute.py | 11 ++++ datasources/douban/search_douban.py | 11 ++++ datasources/douyin/search_douyin.py | 12 +++++ datasources/eightchan/search_8chan.py | 10 +++- datasources/eightkun/search_8kun.py | 10 +++- datasources/fourchan/search_4chan.py | 8 +++ datasources/imgur/search_imgur.py | 3 +- datasources/instagram/search_instagram.py | 12 +++++ datasources/linkedin/search_linkedin.py | 12 +++++ datasources/ninegag/search_9gag.py | 1 + datasources/parler/search_parler.py | 12 +++++ datasources/reddit/search_reddit.py | 10 +++- datasources/telegram/search_telegram.py | 8 +++ datasources/tiktok/search_tiktok.py | 12 +++++ datasources/tiktok_urls/search_tiktok_urls.py | 10 +++- .../tumblr/explorer/tumblr-explorer.css | 6 --- datasources/tumblr/search_tumblr.py | 12 ++++- datasources/twitter-import/search_twitter.py | 12 +++++ datasources/twitterv2/search_twitter.py | 10 +++- datasources/vk/search_vk.py | 11 ++++ webtool/views/views_explorer.py | 50 ++++++------------- 21 files changed, 193 insertions(+), 50 deletions(-) diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index c15540a50..37849891c 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -89,6 +89,17 @@ class SearchBitChute(Search): } + config = { + "explorer.bitchute-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Bitchute CSS", + "default": "", + "tooltip": "Custom CSS for Bitchute posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 0fb983fbe..841bb6037 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -75,6 +75,17 @@ class SearchDouban(Search): } } + config = { + "explorer.douban-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "TCAT CSS", + "default": "", + "tooltip": "Custom CSS for Douban posts in the the Explorer. 
This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Get Douban posts diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index bcad19bfb..b53aab2a4 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -8,6 +8,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchDouyin(Search): """ @@ -27,6 +28,17 @@ class SearchDouyin(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.douyin-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Douyin CSS", + "default": "", + "tooltip": "Custom CSS for Douyin posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index b3d6702b8..fb8970808 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -108,5 +108,13 @@ class Search8Chan(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - } + }, + "explorer.eightchan-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "8chan CSS", + "default": "", + "tooltip": "Custom CSS for 8chan posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index e54e69d3f..647434f6f 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -111,5 +111,13 @@ class Search8Kun(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - } + }, + "explorer.eightkun-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "8kun CSS", + "default": "", + "tooltip": "Custom CSS for 8kun posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 17694badc..7e8638caf 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -443,6 +443,14 @@ class Search4Chan(SearchWithScope): "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" }, + "explorer.fourchan-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "4chan CSS", + "default": "", + "tooltip": "Custom CSS for 4chan posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } } def get_items_simple(self, query): diff --git a/datasources/imgur/search_imgur.py b/datasources/imgur/search_imgur.py index d3e55c38d..72b04369a 100644 --- a/datasources/imgur/search_imgur.py +++ b/datasources/imgur/search_imgur.py @@ -8,8 +8,9 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput -class SearchNineGag(Search): +class SearchImgur(Search): """ Import scraped Imgur data """ diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index fff5a01c8..0118aef36 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -10,6 +10,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem, MissingMappedField from common.lib.exceptions import WorkerInterruptedException, MapItemException +from common.lib.helpers import UserInput class SearchInstagram(Search): @@ -30,6 +31,17 @@ class SearchInstagram(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)" ] + config = { + "explorer.instagram-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Instagram CSS", + "default": "", + "tooltip": "Custom CSS for Instagram posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + # some magic numbers instagram uses MEDIA_TYPE_PHOTO = 1 MEDIA_TYPE_VIDEO = 2 diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index d8c0df453..99c2e8efb 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -11,6 +11,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchLinkedIn(Search): """ @@ -30,6 +31,17 @@ class SearchLinkedIn(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" ] + config = { + "explorer.linkedin-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "LinkedIn CSS", + "default": "", + "tooltip": "Custom CSS for LinkedIn posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/ninegag/search_9gag.py b/datasources/ninegag/search_9gag.py index 973de82ba..4d3768361 100644 --- a/datasources/ninegag/search_9gag.py +++ b/datasources/ninegag/search_9gag.py @@ -8,6 +8,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchNineGag(Search): diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 8ccc7ccd8..3ceb95b3a 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -10,6 +10,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchParler(Search): @@ -26,6 +27,17 @@ class SearchParler(Search): # not available as a processor for existing datasets accepts = [None] + config = { + "explorer.parler-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Parler CSS", + "default": "", + "tooltip": "Custom CSS for Parler posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/reddit/search_reddit.py b/datasources/reddit/search_reddit.py index ead44b142..be21608bb 100644 --- a/datasources/reddit/search_reddit.py +++ b/datasources/reddit/search_reddit.py @@ -115,7 +115,15 @@ class SearchReddit(Search): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query Pushshift without specifying a keyword. This can lead to HUGE datasets!" - } + }, + "explorer.reddit-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Reddit CSS", + "default": "", + "tooltip": "Custom CSS for Reddit posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } # These change depending on the API type used, diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e0e9bb142..2b30aa07f 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -66,6 +66,14 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." + }, + "explorer.telegram-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Telegram CSS", + "default": "", + "tooltip": "Custom CSS for Telegram posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
} } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 90f443b49..29e082769 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -9,6 +9,7 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchTikTok(Search): @@ -29,6 +30,17 @@ class SearchTikTok(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.tiktok-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Tiktok CSS", + "default": "", + "tooltip": "Custom CSS for Tiktok posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index d8864be91..8a61d0f92 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -46,7 +46,15 @@ class SearchTikTokByID(Search): "default": 1.0, "help": "Request wait", "tooltip": "Time to wait before sending a new request from the same IP" - } + }, + "explorer.tiktok-urls-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Tiktok URLs CSS", + "default": "", + "tooltip": "Custom CSS for Tiktok URLs posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } options = { diff --git a/datasources/tumblr/explorer/tumblr-explorer.css b/datasources/tumblr/explorer/tumblr-explorer.css index a7b3df88d..1895e9961 100644 --- a/datasources/tumblr/explorer/tumblr-explorer.css +++ b/datasources/tumblr/explorer/tumblr-explorer.css @@ -1,9 +1,3 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. - - */ - body { background-color: #001935; } diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0ce4328dc..07f6a394d 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -73,6 +73,14 @@ class SearchTumblr(Search): 'help': 'Tumblr API Secret Key', 'tooltip': "", }, + "explorer.tumblr-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "CSS Tumblr", + "default": "", + "tooltip": "Custom CSS for Tumblr posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] @@ -484,9 +492,9 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): Get Tumblr posts posts with a certain blog :param tag, str: the name of the blog you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. - :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param max_date: a unix timestamp, indicates posts should be max_date this date. 
- :returns: a dict created from the JSON response + :returns: a dict created from the JSON response """ blog = blog + ".tumblr.com" diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index 8e8d39e30..b08854e40 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -9,6 +9,7 @@ from backend.lib.search import Search from common.lib.helpers import strip_tags from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput class SearchTwitterViaZeeschuimer(Search): @@ -29,6 +30,17 @@ class SearchTwitterViaZeeschuimer(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] + config = { + "explorer.twitter-import-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Twitter import CSS", + "default": "", + "tooltip": "Custom CSS for Twitter import posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } + } + def get_items(self, query): """ Run custom search diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index a3dbb4482..fe3069d0b 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -61,7 +61,15 @@ class SearchWithTwitterAPIv2(Search): "tooltip": "If enabled, allow users to enter a list of tweet IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." - } + }, + "explorer.twitter-search-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Twitter CSS", + "default": "", + "tooltip": "Custom CSS for Twitter posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." + } } def get_items(self, query): diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index d04daba0a..9efc09a85 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -31,6 +31,17 @@ class SearchVK(Search): "[Python API wrapper](https://github.com/python273/vk_api)" ] + config = { + "explorer.vk-import-explorer-css": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "VK import CSS", + "default": "", + "tooltip": "Custom CSS for VK import posts in the the Explorer. This allows to " + "mimic the original platform appearance. If empty, use the default " + "CSS template (which is also editable on this page)." 
+ } + } + expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index d07ab5273..62c90df4e 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -79,7 +79,7 @@ def explorer_dataset(key, page): # If the dataset is local, we can add some more features # (like the ability to navigate to threads) - is_local = False + is_local = False # CHANGE LATER ///////////////////// if datasource in list(all_modules.datasources.keys()): is_local = True if all_modules.datasources[datasource].get("is_local") else False @@ -128,9 +128,13 @@ def explorer_dataset(key, page): except NotImplementedError: return error(404) - # Include custom css if it exists in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.css'. + # Retrieve custom CSS if it is present in the datasource's config. + # If not given, we use a standard template. This standard CSS template + # can also be changed in the 4CAT control panel under the 'Explorer' + # settings. css = get_custom_css(datasource) + print(datasource) + print("CSS", css) # Include custom fields if it they are in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.json'. @@ -609,46 +613,20 @@ def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offse def get_custom_css(datasource): """ - Check if there's a custom css file for this dataset. - If so, return the text. - Custom css files should be placed in an 'explorer' directory in the the datasource folder and named - '-explorer.css' (e.g. 'reddit/explorer/reddit-explorer.css'). - See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for more information. + Check if there's custom CSS for this data source. + These can be inserted and edited on the Explorer settings page. + If these are absent, we revert to a standard template. :param datasource, str: Datasource name :return: The css as string. """ - # Set the directory name of this datasource. - # Some naming inconsistensies are caught here - if datasource == "twitter": - datasource_dir = "twitter-import" - datasource = "twitter-import" - else: - datasource_dir = datasource - - - css_path = Path(config.get('PATH_ROOT'), "datasources", datasource_dir, "explorer", datasource.lower() + "-explorer.css") - - print(css_path) - read = False - if css_path.exists(): - read = True - else: - # Allow both hypens and underscores in datasource name (to avoid some legacy issues) - css_path = re.sub(datasource, datasource.replace("-", "_"), str(css_path.absolute())) - if Path(css_path).exists(): - read = True - - # Read the css file if it exists - if read: - with open(css_path, "r", encoding="utf-8") as css: - css = css.read() - else: - css = None + custom_css = config.get("explorer." + datasource + "-explorer-css", "") + if not custom_css: + custom_css = config.get("explorer." 
+ datasource + "-search-explorer-css", "") - return css + return custom_css def get_custom_fields(datasource, filetype=None): """ From 0fe3ea64075d59fde8a536bc7194a638a59ba026 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:33:51 +0200 Subject: [PATCH 005/204] Move around Explorer CSS files --- .../static/css/{explorer.css => explorer/explorer-default.css} | 0 webtool/static/css/explorer/telegram-search.css | 3 +++ 2 files changed, 3 insertions(+) rename webtool/static/css/{explorer.css => explorer/explorer-default.css} (100%) create mode 100644 webtool/static/css/explorer/telegram-search.css diff --git a/webtool/static/css/explorer.css b/webtool/static/css/explorer/explorer-default.css similarity index 100% rename from webtool/static/css/explorer.css rename to webtool/static/css/explorer/explorer-default.css diff --git a/webtool/static/css/explorer/telegram-search.css b/webtool/static/css/explorer/telegram-search.css new file mode 100644 index 000000000..8b6e10cd1 --- /dev/null +++ b/webtool/static/css/explorer/telegram-search.css @@ -0,0 +1,3 @@ +* { + color: gold; +} \ No newline at end of file From e06760aa6438a7191759cf791cad83eca122741a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:34:18 +0200 Subject: [PATCH 006/204] Edit custom Explorer CSS options --- common/lib/config_definition.py | 17 +++++++++++++++-- datasources/bitchute/search_bitchute.py | 4 +--- datasources/douban/search_douban.py | 6 ++---- datasources/douyin/search_douyin.py | 4 +--- datasources/eightchan/search_8chan.py | 6 ++---- datasources/eightkun/search_8kun.py | 6 ++---- datasources/fourchan/search_4chan.py | 6 ++---- datasources/instagram/search_instagram.py | 4 +--- datasources/reddit/search_reddit.py | 10 +--------- datasources/telegram/search_telegram.py | 12 ++++++++---- webtool/views/views_explorer.py | 2 -- 11 files changed, 35 insertions(+), 42 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index b1746bb78..42e42b083 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.max_posts": { + "explorer.__max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,13 +313,26 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.posts_per_page": { + "explorer.__posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer._explanation_custom_fields": { + "type": UserInput.OPTION_INFO, + "help": "You can customise how posts per data source appear in the Explorer. " + "This involves *custom fields* via a JSON that points to what fields should " + "be displayed. These fields can also be formatted in a specific ways, for " + "instance as a URL or together with specific icons. If this JSON is absent, " + "the Explorer by default shows the `author`, `subject`, `timestamp`, `body`, and" + " `image` fields. *Custom CSS* can be added to change the appearance of posts. " + "This allows to mimic the original platform appearance. Custom CSS can be inserted " + "below. For some data sources, pre-made templates are available. These be toggled " + "below. If no custom or pre-made CSS is available, a general template is used." 
+ "tsts" + }, # Web tool settings # These are used by the FlaskConfig class in config.py # Flask may require a restart to update them diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index 37849891c..b42d317b3 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -94,9 +94,7 @@ class SearchBitChute(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Bitchute CSS", "default": "", - "tooltip": "Custom CSS for Bitchute posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Bitchute posts in the the Explorer." } } diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 841bb6037..2a7d1a23f 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -78,11 +78,9 @@ class SearchDouban(Search): config = { "explorer.douban-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "TCAT CSS", + "help": "Douban CSS", "default": "", - "tooltip": "Custom CSS for Douban posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Douban posts in the the Explorer." } } diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index b53aab2a4..9d926123d 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -33,9 +33,7 @@ class SearchDouyin(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douyin CSS", "default": "", - "tooltip": "Custom CSS for Douyin posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Douyin posts in the the Explorer." } } diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index fb8970808..57b92987a 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -109,12 +109,10 @@ class Search8Chan(Search4Chan): "default": [], "global": True }, - "explorer.eightchan-explorer-css": { + "explorer.eightchan-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "8chan CSS", "default": "", - "tooltip": "Custom CSS for 8chan posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 8chan posts in the the Explorer." } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index 647434f6f..333daa55e 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -112,12 +112,10 @@ class Search8Kun(Search4Chan): "default": [], "global": True }, - "explorer.eightkun-explorer-css": { + "explorer.eightkun-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "8kun CSS", "default": "", - "tooltip": "Custom CSS for 8kun posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 8kun posts in the the Explorer." 
} } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 7e8638caf..d0bfb8d84 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -443,13 +443,11 @@ class Search4Chan(SearchWithScope): "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" }, - "explorer.fourchan-explorer-css": { + "explorer.fourchan-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, "help": "4chan CSS", "default": "", - "tooltip": "Custom CSS for 4chan posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for 4chan posts in the the Explorer." } } diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 0118aef36..32a7a75d6 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -36,9 +36,7 @@ class SearchInstagram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Instagram CSS", "default": "", - "tooltip": "Custom CSS for Instagram posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Instagram posts in the the Explorer." } } diff --git a/datasources/reddit/search_reddit.py b/datasources/reddit/search_reddit.py index be21608bb..ead44b142 100644 --- a/datasources/reddit/search_reddit.py +++ b/datasources/reddit/search_reddit.py @@ -115,15 +115,7 @@ class SearchReddit(Search): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query Pushshift without specifying a keyword. This can lead to HUGE datasets!" - }, - "explorer.reddit-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Reddit CSS", - "default": "", - "tooltip": "Custom CSS for Reddit posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } + } } # These change depending on the API type used, diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 2b30aa07f..a32789f19 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -67,13 +67,17 @@ class SearchTelegram(Search): "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." }, + "explorer.telegram-search-explorer-default-css": { + "type": UserInput.OPTION_TOGGLE, + "help": "Use Telegram default CSS", + "default": "", + "tooltip": "Add custom styling for Telegram posts in the the Explorer." + }, "explorer.telegram-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Telegram CSS", + "help": "Custom Telegram CSS", "default": "", - "tooltip": "Custom CSS for Telegram posts in the the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." + "tooltip": "Add custom styling for Telegram posts in the the Explorer." 
} } diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 62c90df4e..b02dcf1c2 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -133,8 +133,6 @@ def explorer_dataset(key, page): # can also be changed in the 4CAT control panel under the 'Explorer' # settings. css = get_custom_css(datasource) - print(datasource) - print("CSS", css) # Include custom fields if it they are in the datasource's 'explorer' dir. # The file's naming format should e.g. be 'reddit-explorer.json'. From a921967749fd14c3aaf7ddaf8df80a4a539da675 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Apr 2024 17:35:03 +0200 Subject: [PATCH 007/204] Forgot to save these --- common/lib/config_definition.py | 1 - datasources/telegram/search_telegram.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 42e42b083..d113f8c09 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -331,7 +331,6 @@ "This allows to mimic the original platform appearance. Custom CSS can be inserted " "below. For some data sources, pre-made templates are available. These be toggled " "below. If no custom or pre-made CSS is available, a general template is used." - "tsts" }, # Web tool settings # These are used by the FlaskConfig class in config.py diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index a32789f19..60db3d76e 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -69,9 +69,9 @@ class SearchTelegram(Search): }, "explorer.telegram-search-explorer-default-css": { "type": UserInput.OPTION_TOGGLE, - "help": "Use Telegram default CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the the Explorer." + "help": "Use default Telegram CSS", + "default": True + "tooltip": "See " }, "explorer.telegram-search-explorer-css": { "type": UserInput.OPTION_TEXT_LARGE, From e37ebc97a1f29132b291dcb073f73744493307b8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Apr 2024 11:49:47 +0200 Subject: [PATCH 008/204] Typozzz --- common/lib/config_definition.py | 16 ++++++++-------- datasources/bitchute/search_bitchute.py | 2 +- datasources/douban/search_douban.py | 2 +- datasources/douyin/search_douyin.py | 2 +- datasources/eightchan/search_8chan.py | 2 +- datasources/eightkun/search_8kun.py | 2 +- datasources/fourchan/search_4chan.py | 2 +- datasources/instagram/search_instagram.py | 2 +- datasources/linkedin/search_linkedin.py | 2 +- datasources/parler/search_parler.py | 2 +- datasources/telegram/search_telegram.py | 2 +- datasources/tiktok/search_tiktok.py | 2 +- datasources/tiktok_urls/search_tiktok_urls.py | 2 +- datasources/tumblr/search_tumblr.py | 2 +- datasources/twitter-import/search_twitter.py | 2 +- datasources/twitterv2/search_twitter.py | 2 +- datasources/vk/search_vk.py | 2 +- 17 files changed, 24 insertions(+), 24 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d113f8c09..1e5d5949d 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -323,14 +323,14 @@ "explorer._explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. " - "This involves *custom fields* via a JSON that points to what fields should " - "be displayed. 
These fields can also be formatted in a specific ways, for " - "instance as a URL or together with specific icons. If this JSON is absent, " - "the Explorer by default shows the `author`, `subject`, `timestamp`, `body`, and" - " `image` fields. *Custom CSS* can be added to change the appearance of posts. " - "This allows to mimic the original platform appearance. Custom CSS can be inserted " - "below. For some data sources, pre-made templates are available. These be toggled " - "below. If no custom or pre-made CSS is available, a general template is used." + "This involves *custom fields*; a JSON that points to what fields should " + "be displayed. These fields can be formatted, for instance as a URL or together " + " with specific icons. If this JSON is absent, the Explorer by default shows the " + "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " + "added to change the appearance of posts. This allows to mimic the original platform " + "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " + "are available. These can be toggled below. If no custom or pre-made CSS is available, a " + "general template is used." }, # Web tool settings # These are used by the FlaskConfig class in config.py diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index b42d317b3..28a899237 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -94,7 +94,7 @@ class SearchBitChute(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Bitchute CSS", "default": "", - "tooltip": "Add custom styling for Bitchute posts in the the Explorer." + "tooltip": "Add custom styling for Bitchute posts in the Explorer." } } diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 2a7d1a23f..704fd8a23 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -80,7 +80,7 @@ class SearchDouban(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douban CSS", "default": "", - "tooltip": "Add custom styling for Douban posts in the the Explorer." + "tooltip": "Add custom styling for Douban posts in the Explorer." } } diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 9d926123d..ebf9b4450 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -33,7 +33,7 @@ class SearchDouyin(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Douyin CSS", "default": "", - "tooltip": "Add custom styling for Douyin posts in the the Explorer." + "tooltip": "Add custom styling for Douyin posts in the Explorer." } } diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index 57b92987a..fdc3fc555 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -113,6 +113,6 @@ class Search8Chan(Search4Chan): "type": UserInput.OPTION_TEXT_LARGE, "help": "8chan CSS", "default": "", - "tooltip": "Add custom styling for 8chan posts in the the Explorer." + "tooltip": "Add custom styling for 8chan posts in the Explorer." 
} } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index 333daa55e..e32c4d4e2 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -116,6 +116,6 @@ class Search8Kun(Search4Chan): "type": UserInput.OPTION_TEXT_LARGE, "help": "8kun CSS", "default": "", - "tooltip": "Add custom styling for 8kun posts in the the Explorer." + "tooltip": "Add custom styling for 8kun posts in the Explorer." } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index d0bfb8d84..8a54812be 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -447,7 +447,7 @@ class Search4Chan(SearchWithScope): "type": UserInput.OPTION_TEXT_LARGE, "help": "4chan CSS", "default": "", - "tooltip": "Add custom styling for 4chan posts in the the Explorer." + "tooltip": "Add custom styling for 4chan posts in the Explorer." } } diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 32a7a75d6..fa22cedaf 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -36,7 +36,7 @@ class SearchInstagram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Instagram CSS", "default": "", - "tooltip": "Add custom styling for Instagram posts in the the Explorer." + "tooltip": "Add custom styling for Instagram posts in the Explorer." } } diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 99c2e8efb..65df1d55b 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -36,7 +36,7 @@ class SearchLinkedIn(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "LinkedIn CSS", "default": "", - "tooltip": "Custom CSS for LinkedIn posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for LinkedIn posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 3ceb95b3a..fab89e8ae 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -32,7 +32,7 @@ class SearchParler(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Parler CSS", "default": "", - "tooltip": "Custom CSS for Parler posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Parler posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 60db3d76e..e8496c1ab 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -77,7 +77,7 @@ class SearchTelegram(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Custom Telegram CSS", "default": "", - "tooltip": "Add custom styling for Telegram posts in the the Explorer." + "tooltip": "Add custom styling for Telegram posts in the Explorer." 
} } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 29e082769..6aff822cc 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -35,7 +35,7 @@ class SearchTikTok(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Tiktok CSS", "default": "", - "tooltip": "Custom CSS for Tiktok posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tiktok posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index 8a61d0f92..82e8b1f1b 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -51,7 +51,7 @@ class SearchTikTokByID(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Tiktok URLs CSS", "default": "", - "tooltip": "Custom CSS for Tiktok URLs posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tiktok URLs posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 07f6a394d..0dc72c04a 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -77,7 +77,7 @@ class SearchTumblr(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "CSS Tumblr", "default": "", - "tooltip": "Custom CSS for Tumblr posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Tumblr posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index b08854e40..b1d5a25d1 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -35,7 +35,7 @@ class SearchTwitterViaZeeschuimer(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Twitter import CSS", "default": "", - "tooltip": "Custom CSS for Twitter import posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Twitter import posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index fe3069d0b..76f7395bb 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -66,7 +66,7 @@ class SearchWithTwitterAPIv2(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "Twitter CSS", "default": "", - "tooltip": "Custom CSS for Twitter posts in the the Explorer. This allows to " + "tooltip": "Custom CSS for Twitter posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index 9efc09a85..f4b42421e 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -36,7 +36,7 @@ class SearchVK(Search): "type": UserInput.OPTION_TEXT_LARGE, "help": "VK import CSS", "default": "", - "tooltip": "Custom CSS for VK import posts in the the Explorer. 
This allows to " + "tooltip": "Custom CSS for VK import posts in the Explorer. This allows to " "mimic the original platform appearance. If empty, use the default " "CSS template (which is also editable on this page)." } From a7668f0061461f69f98bc11a8ec7b3992df342b8 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 10 Apr 2024 17:28:40 +0200 Subject: [PATCH 009/204] First setup for dynamic Explorer options in Settings --- common/lib/config_definition.py | 72 +++++++++++++++++++++++-- datasources/telegram/search_telegram.py | 13 +---- 2 files changed, 70 insertions(+), 15 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1e5d5949d..fec3f680d 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.__max_posts": { + "explorer._max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,14 +313,26 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer.posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, - "explorer._explanation_custom_fields": { + "explorer.explanation_custom_fields": { + "type": UserInput.OPTION_INFO, + "help": "You can customise how posts per data source appear in the Explorer. " + "This involves *custom fields*; a JSON that points to what fields should " + "be displayed. These fields can be formatted, for instance as a URL or together " + " with specific icons. If this JSON is absent, the Explorer by default shows the " + "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " + "added to change the appearance of posts. This allows to mimic the original platform " + "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " + "are available. These can be toggled below. If no custom or pre-made CSS is available, a " + "general template is used." + }, + "explorer.explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. " "This involves *custom fields*; a JSON that points to what fields should " @@ -520,6 +532,60 @@ }, } +# Dynamically add some Explorer options per data source. +# These are all the same, so we're looping over +# data sources to avoid redunancy. +modules = ["4chan", "telegram"] +for module in modules: + print(module) + # Explorer custom fields: default template, data source preset, or custom. + explorer_options = { + "explorer." + module + "-fields": { + "type": UserInput.OPTION_CHOICE, + "help": module + " fields", + "options": { + "general": "General fields", + "custom": "Custom (insert below)" + }, + "default": "general" + }, + # Custom Explorer fields JSON + "explorer." + module + "-fields-json": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Custom " + module + " fields", + "default": "", + "tooltip": "Add custom fields for " + module + " posts in the Explorer." + }, + # Explorer CSS: default template, data source preset, or custom. + "explorer." + module + "-css": { + "type": UserInput.OPTION_CHOICE, + "help": module + " CSS", + "options": { + "general": "General fields", + "custom": "Custom (insert below)" + }, + "default": "general" + }, + # Custom Explorer CSS + "explorer." 
+ module + "-css-text": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Custom " + module + " CSS", + "default": "", + "tooltip": "Add custom styling for " + module + " posts in the Explorer." + } + } + + # If this data source has preset custom fields and CSS stylesheets + # (which must be signalled via the `has_explorer_preset` attribute in the + # data source script), we're adding the default option to select this preset. + if module: + explorer_options["explorer." + module + "-fields"]["options"]["preset"] = "Data source preset" + explorer_options["explorer." + module + "-fields"]["default"] = "preset" + explorer_options["explorer." + module + "-css"]["options"]["preset"] = "Data source preset" + explorer_options["explorer." + module + "-css"]["default"] = "preset" + + config_definition = {**config_definition, **explorer_options} + # These are used in the web interface for more readable names # Can't think of a better place to put them... categories = { diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e8496c1ab..477cd9999 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -39,6 +39,7 @@ class SearchTelegram(Search): extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated + has_explorer_preset = True # Whether this data source has preset CSS and field settings for the Explorer # cache details_cache = None @@ -66,18 +67,6 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." - }, - "explorer.telegram-search-explorer-default-css": { - "type": UserInput.OPTION_TOGGLE, - "help": "Use default Telegram CSS", - "default": True - "tooltip": "See " - }, - "explorer.telegram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Custom Telegram CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the Explorer." 
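The config_definition.py loop above generates the same four Explorer options for every entry in `modules` and merges them into the central definition. Two details worth noting: `print(module)` is a leftover debug statement, and `if module:` is always truthy for a non-empty module name, so the "preset" choice is added unconditionally rather than only for data sources that set the `has_explorer_preset` attribute the comment mentions (the attribute this same commit adds to `SearchTelegram`). A condensed sketch of the intended pattern, with `datasource_classes` as a hypothetical lookup from module name to search class:

```python
from common.lib.user_input import UserInput  # import used by config_definition.py

config_definition = {}
modules = ["fourchan", "telegram"]  # illustrative module list
datasource_classes = {}             # hypothetical: module name -> Search class

for module in modules:
    explorer_options = {
        "explorer." + module + "-fields": {
            "type": UserInput.OPTION_CHOICE,
            "help": module + " fields",
            "options": {"general": "General fields", "custom": "Custom (insert below)"},
            "default": "general",
        },
    }
    # Only offer the "preset" choice when the data source actually signals
    # that it ships preset fields/CSS, as the comment in the patch describes.
    datasource = datasource_classes.get(module)
    if datasource and getattr(datasource, "has_explorer_preset", False):
        explorer_options["explorer." + module + "-fields"]["options"]["preset"] = "Data source preset"
        explorer_options["explorer." + module + "-fields"]["default"] = "preset"

    config_definition = {**config_definition, **explorer_options}
```

The follow-up commits replace these generated options with a single table-type setting, which keeps the settings page from growing by several options per data source.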
} } From 59e33b0c83736fdeabf32964ff658c6e6a69be8f Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 15 Apr 2024 11:40:37 +0200 Subject: [PATCH 010/204] First steps to datasource table user input --- common/lib/config_definition.py | 37 +++++++++++++- common/lib/user_input.py | 8 +++ datasources/bitchute/search_bitchute.py | 9 ---- datasources/douban/search_douban.py | 9 ---- datasources/douyin/search_douyin.py | 11 +--- datasources/eightchan/search_8chan.py | 8 +-- datasources/eightkun/search_8kun.py | 8 +-- datasources/fourchan/search_4chan.py | 8 +-- datasources/instagram/search_instagram.py | 9 ---- datasources/linkedin/search_linkedin.py | 11 ---- datasources/parler/search_parler.py | 11 ---- datasources/telegram/search_telegram.py | 12 ----- datasources/tiktok/search_tiktok.py | 11 ---- datasources/tiktok_urls/search_tiktok_urls.py | 10 +--- datasources/tumblr/search_tumblr.py | 8 --- datasources/twitter-import/search_twitter.py | 13 +---- datasources/twitterv2/search_twitter.py | 10 +--- datasources/vk/search_vk.py | 13 +---- webtool/lib/template_filters.py | 4 +- .../components/datasource-option.html | 50 +++++++++++++++++++ webtool/templates/controlpanel/config.html | 1 + 21 files changed, 104 insertions(+), 157 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 1e5d5949d..d43ed9d99 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,7 @@ "global": True }, # Explorer settings - "explorer.__max_posts": { + "explorer._max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,13 +313,46 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer._posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer.explorer_config": { + "type": UserInput.OPTION_DATASOURCES_TABLE, + "help": "Explorer settings per data source", + "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "custom", "test": "TEST"}, "tumblr": {"enabled": True, "css": "preset", "fields": "custom"}}, + "columns": { + "enabled": { + "type": "toggle", + "help": "Enable Explorer" + }, + "name": { + "type": "text", + "help": "Test" + }, + "fields": { + "type": "choice", + "help": "Fields", + "options": { + "general": "Default fields", + "preset": "Data source preset", + "custom": "Custom (insert below)" + } + }, + "css": { + "type": "choice", + "help": "CSS", + "options": { + "general": "Default template", + "preset": "Data source preset", + "custom": "Custom (insert below)" + } + } + } + }, "explorer._explanation_custom_fields": { "type": UserInput.OPTION_INFO, "help": "You can customise how posts per data source appear in the Explorer. 
" diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 248421b5c..9d9996f11 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -35,6 +35,8 @@ class UserInput: OPTION_FILE = "file" # file upload OPTION_HUE = "hue" # colour hue OPTION_DATASOURCES = "datasources" # data source toggling + OPTION_DATASOURCES_TABLE = "datasources_table" # a table with settings per data source + OPTION_DATASOURCES_TEXT = "datasources_text" # text input per data source (via dropdown) OPTIONS_COSMETIC = (OPTION_INFO, OPTION_DIVIDER) @@ -142,6 +144,9 @@ def parse_all(options, input, silently_correct=True): parsed_input[option] = [datasource for datasource, v in datasources.items() if v["enabled"]] parsed_input[option.split(".")[0] + ".expiration"] = datasources + elif settings.get("type") == UserInput.OPTION_DATASOURCES_TABLE: + # special case, loop through a table to generate a JSON + print("yea") elif option not in input: # not provided? use default @@ -338,6 +343,9 @@ def parse_value(settings, choice, other_input=None, silently_correct=True): else: return choice + elif input_type == UserInput.DATASOURCES_TABLE: + return "weeird" + else: # no filtering return choice diff --git a/datasources/bitchute/search_bitchute.py b/datasources/bitchute/search_bitchute.py index 28a899237..c15540a50 100644 --- a/datasources/bitchute/search_bitchute.py +++ b/datasources/bitchute/search_bitchute.py @@ -89,15 +89,6 @@ class SearchBitChute(Search): } - config = { - "explorer.bitchute-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Bitchute CSS", - "default": "", - "tooltip": "Add custom styling for Bitchute posts in the Explorer." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/douban/search_douban.py b/datasources/douban/search_douban.py index 704fd8a23..0fb983fbe 100644 --- a/datasources/douban/search_douban.py +++ b/datasources/douban/search_douban.py @@ -75,15 +75,6 @@ class SearchDouban(Search): } } - config = { - "explorer.douban-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Douban CSS", - "default": "", - "tooltip": "Add custom styling for Douban posts in the Explorer." - } - } - def get_items(self, query): """ Get Douban posts diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index ebf9b4450..3997cd74c 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -27,16 +27,7 @@ class SearchDouyin(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - - config = { - "explorer.douyin-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Douyin CSS", - "default": "", - "tooltip": "Add custom styling for Douyin posts in the Explorer." 
- } - } - + def get_items(self, query): """ Run custom search diff --git a/datasources/eightchan/search_8chan.py b/datasources/eightchan/search_8chan.py index fdc3fc555..b3d6702b8 100644 --- a/datasources/eightchan/search_8chan.py +++ b/datasources/eightchan/search_8chan.py @@ -108,11 +108,5 @@ class Search8Chan(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - }, - "explorer.eightchan-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "8chan CSS", - "default": "", - "tooltip": "Add custom styling for 8chan posts in the Explorer." - } + } } diff --git a/datasources/eightkun/search_8kun.py b/datasources/eightkun/search_8kun.py index e32c4d4e2..e54e69d3f 100644 --- a/datasources/eightkun/search_8kun.py +++ b/datasources/eightkun/search_8kun.py @@ -111,11 +111,5 @@ class Search8Kun(Search4Chan): "tooltip": "These boards will not be scraped, but can still be indexed if added to 'Boards to index'", "default": [], "global": True - }, - "explorer.eightkun-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "8kun CSS", - "default": "", - "tooltip": "Add custom styling for 8kun posts in the Explorer." - } + } } \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 8a54812be..7b69b872e 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -442,13 +442,7 @@ class Search4Chan(SearchWithScope): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" - }, - "explorer.fourchan-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "4chan CSS", - "default": "", - "tooltip": "Add custom styling for 4chan posts in the Explorer." - } + } } def get_items_simple(self, query): diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index fa22cedaf..daa42471d 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -31,15 +31,6 @@ class SearchInstagram(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also covers usage with Instagram)" ] - config = { - "explorer.instagram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Instagram CSS", - "default": "", - "tooltip": "Add custom styling for Instagram posts in the Explorer." - } - } - # some magic numbers instagram uses MEDIA_TYPE_PHOTO = 1 MEDIA_TYPE_VIDEO = 2 diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 65df1d55b..ef29353d4 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -31,17 +31,6 @@ class SearchLinkedIn(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok) (also explains general usage of Zeeschuimer)" ] - config = { - "explorer.linkedin-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "LinkedIn CSS", - "default": "", - "tooltip": "Custom CSS for LinkedIn posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index fab89e8ae..07b6116ce 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -27,17 +27,6 @@ class SearchParler(Search): # not available as a processor for existing datasets accepts = [None] - config = { - "explorer.parler-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Parler CSS", - "default": "", - "tooltip": "Custom CSS for Parler posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index e8496c1ab..e0e9bb142 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -66,18 +66,6 @@ class SearchTelegram(Search): "default": 25, "tooltip": "Amount of entities that can be queried at a time. Entities are groups or channels. 0 to " "disable limit." - }, - "explorer.telegram-search-explorer-default-css": { - "type": UserInput.OPTION_TOGGLE, - "help": "Use default Telegram CSS", - "default": True - "tooltip": "See " - }, - "explorer.telegram-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Custom Telegram CSS", - "default": "", - "tooltip": "Add custom styling for Telegram posts in the Explorer." } } diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index 6aff822cc..b3214bc42 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -30,17 +30,6 @@ class SearchTikTok(Search): "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - config = { - "explorer.tiktok-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tiktok CSS", - "default": "", - "tooltip": "Custom CSS for Tiktok posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - def get_items(self, query): """ Run custom search diff --git a/datasources/tiktok_urls/search_tiktok_urls.py b/datasources/tiktok_urls/search_tiktok_urls.py index 82e8b1f1b..d8864be91 100644 --- a/datasources/tiktok_urls/search_tiktok_urls.py +++ b/datasources/tiktok_urls/search_tiktok_urls.py @@ -46,15 +46,7 @@ class SearchTikTokByID(Search): "default": 1.0, "help": "Request wait", "tooltip": "Time to wait before sending a new request from the same IP" - }, - "explorer.tiktok-urls-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tiktok URLs CSS", - "default": "", - "tooltip": "Custom CSS for Tiktok URLs posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } + } } options = { diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0dc72c04a..191fec22e 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -72,14 +72,6 @@ class SearchTumblr(Search): 'default': "", 'help': 'Tumblr API Secret Key', 'tooltip': "", - }, - "explorer.tumblr-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "CSS Tumblr", - "default": "", - "tooltip": "Custom CSS for Tumblr posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." } } references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index b1d5a25d1..7984cc69b 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -29,18 +29,7 @@ class SearchTwitterViaZeeschuimer(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - - config = { - "explorer.twitter-import-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Twitter import CSS", - "default": "", - "tooltip": "Custom CSS for Twitter import posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } - } - + def get_items(self, query): """ Run custom search diff --git a/datasources/twitterv2/search_twitter.py b/datasources/twitterv2/search_twitter.py index 76f7395bb..a3dbb4482 100644 --- a/datasources/twitterv2/search_twitter.py +++ b/datasources/twitterv2/search_twitter.py @@ -61,15 +61,7 @@ class SearchWithTwitterAPIv2(Search): "tooltip": "If enabled, allow users to enter a list of tweet IDs " "to retrieve. This is disabled by default because it " "can be confusing to novice users." - }, - "explorer.twitter-search-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Twitter CSS", - "default": "", - "tooltip": "Custom CSS for Twitter posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." - } + } } def get_items(self, query): diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index f4b42421e..22c5581a9 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -30,18 +30,7 @@ class SearchVK(Search): "[VK API documentation](https://vk.com/dev/first_guide)", "[Python API wrapper](https://github.com/python273/vk_api)" ] - - config = { - "explorer.vk-import-explorer-css": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "VK import CSS", - "default": "", - "tooltip": "Custom CSS for VK import posts in the Explorer. This allows to " - "mimic the original platform appearance. If empty, use the default " - "CSS template (which is also editable on this page)." 
- } - } - + expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 9cbe1897e..dd04d3eb1 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -230,8 +230,8 @@ def _jinja2_filter_4chan_image(image_4chan, post_id, board, image_md5): -@app.template_filter('post_field') -def _jinja2_filter_post_field(field, post): +@app.template_filter('dict_field') +def _jinja2_filter_dict_field(field, post): # Extracts string values between {{ two curly brackets }} and uses that # as a dictionary key for the given dict. It then returns the corresponding value. # Mainly used in the Explorer. diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index 7c6c788de..32d414360 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -147,6 +147,56 @@ + {% elif settings.type == "datasources_table" %} + {{settings}} +
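A few hunks up, the `post_field` template filter becomes `dict_field`: per its comment it extracts strings between `{{ two curly brackets }}` and uses them as dictionary keys, which is what lets Explorer field definitions reference arbitrary post attributes. The diff shows only the rename, so this is a hedged sketch of how such a filter could work, not 4CAT's actual implementation:

```python
import re

# Hedged sketch of a {{ placeholder }}-resolving filter; the regex and the
# empty-string fallback for unknown keys are assumptions.
def dict_field(field, post):
    # Replace every "{{ key }}" occurrence with post["key"].
    return re.sub(
        r"\{\{\s*([^{}]+?)\s*\}\}",
        lambda match: str(post.get(match.group(1), "")),
        field,
    )

# e.g. dict_field("{{ author }} wrote: {{ body }}", {"author": "anon", "body": "hi"})
# -> "anon wrote: hi"
```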
+ {{option}} +
+ {{options.option}} +
+ + + + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + + + {% for datasource, datasource_config in datasources_config.items() %} + {% if datasource_config.enabled %} + + {% if datasource in settings.default %} + {% set default_settings = settings.default[datasource] %} + {% else %} + {% default_settings = None %} + {% endif %} + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
Data source{{ column.help }}
{{ datasource_config.name }} + {% if column.type == "text" %} + + } + {% elif column.type == "toggle" %} + + {% elif column.type == "choice" %} + + {% endif %} +
+
{% endif %} diff --git a/webtool/templates/controlpanel/config.html b/webtool/templates/controlpanel/config.html index ca5a7930d..cdc19d453 100644 --- a/webtool/templates/controlpanel/config.html +++ b/webtool/templates/controlpanel/config.html @@ -77,6 +77,7 @@

{% else %} {% set settings = options[option] %} {% endif %} + {% include 'components/datasource-option.html' %} {% endfor %} From 46628c62a6e9f720100000ddfc9c74f97dbad45e Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:19:43 +0200 Subject: [PATCH 011/204] Add basic UserInput.DATASOURCES_TABLE functionality, and use in Explorer settings page --- common/lib/config_definition.py | 75 +++++++++++-------- common/lib/user_input.py | 19 +++-- .../components/datasource-option.html | 38 +++++----- 3 files changed, 77 insertions(+), 55 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index d43ed9d99..482b5969a 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -305,7 +305,13 @@ "global": True }, # Explorer settings - "explorer._max_posts": { + "explorer.__basic-explanation": { + "type": UserInput.OPTION_INFO, + "help": "4CAT's Explorer feature lets you navigate and annotate datasets as if they " + "appared on their original platform. This is intended to facilitate qualitative " + "exploration and manual coding." + }, + "explorer.__max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -313,58 +319,65 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer._posts_per_page": { + "explorer.__posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, + "explorer._config_explanation": { + "type": UserInput.OPTION_INFO, + "help": "Per data source, you can enable/disable the Explorer and customise how posts appear. " + "The latter involves *what fields to show* and *how posts are styled*. Many data sources have tailored " + "[presets for this](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/static/" + "explorer-presets). If presets are unavailable, a general template for [fields](https://github.com/" + "digitalmethodsinitiative/4cat/tree/master/webtool/static/explorer-presets/default-fields.json) " + "and [CSS styling](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/static/" + "explorer-presets/default-css.css) is used. You can also toggle between data source presets and the general " + "template via the table below." + }, + # "explorer._config_explanation2": { + # "type": UserInput.OPTION_INFO, + # "help": "Alternatively, you can also *customise fields and CSS yourself* by choosing the `Custom` setting in the " + # "table below and inserting JSON and CSS values in the text boxes underneath. See the [wiki for instructions " + # "on how to format custom fields and CSS](https://github.com/digitalmethodsinitiative/4cat/wiki/" + # "Exploring-and-annotating-datasets#add-custom-fields)." 
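The `explorer.explorer_config` table defined just below stores one small dict per data source (enabled / fields / css). Consumers read it back with chained `.get()` calls so that unknown data sources fail closed; `views_dataset.py` later in this series does exactly this to decide whether to show the Explorer button (by then the key has been shortened to `explorer.config`). A sketch of that read side, assuming `config` and `datasource` from the surrounding view code:

```python
# One nested dict per data source; chained .get() calls make missing
# entries fall back to "disabled" and the general templates.
explorer_config = config.get("explorer.config", {})
datasource_config = explorer_config.get(datasource, {})

enabled = datasource_config.get("enabled", False)
fields_mode = datasource_config.get("fields", "general")   # "general" or "preset"
css_mode = datasource_config.get("css", "general")
```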
+ # }, "explorer.explorer_config": { "type": UserInput.OPTION_DATASOURCES_TABLE, "help": "Explorer settings per data source", - "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "custom", "test": "TEST"}, "tumblr": {"enabled": True, "css": "preset", "fields": "custom"}}, + "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "general"}, "eightchan": {"enabled": True, "css": "general", "fields": "general"}, "eightkun": {"enabled": True, "css": "general", "fields": "general"}, "ninegag": {"enabled": True, "css": "general", "fields": "general"}, "bitchute": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcat": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcatv2": {"enabled": True, "css": "general", "fields": "general"}, "douban": {"enabled": True, "css": "general", "fields": "general"}, "douyin": {"enabled": False, "css": "general", "fields": "general"}, "imgur": {"enabled": True, "css": "general", "fields": "general"}, "upload": {"enabled": True, "css": "general", "fields": "general"}, "instagram": {"enabled": True, "css": "preset", "fields": "preset"}, "linkedin": {"enabled": True, "css": "general", "fields": "general"}, "parler": {"enabled": True, "css": "general", "fields": "general"}, "reddit": {"enabled": True, "css": "preset", "fields": "preset"}, "telegram": {"enabled": True, "css": "general", "fields": "general"}, "tiktok": {"enabled": True, "css": "preset", "fields": "preset"}, "tiktok-urls": {"enabled": True, "css": "preset", "fields": "preset"}, "tumblr": {"enabled": True, "css": "preset", "fields": "preset"}, "twitter": {"enabled": True, "css": "preset", "fields": "preset"}, "twitterv2": {"enabled": True, "css": "preset", "fields": "preset"}, "usenet": {"enabled": True, "css": "general", "fields": "general"}, "vk": {"enabled": True, "css": "general", "fields": "general"}}, "columns": { "enabled": { - "type": "toggle", - "help": "Enable Explorer" - }, - "name": { - "type": "text", - "help": "Test" + "type": UserInput.OPTION_TOGGLE, + "help": "Enable", + "tooltip": "Whether the Explorer is available for this data source", + "default": True }, "fields": { - "type": "choice", + "type": UserInput.OPTION_CHOICE, "help": "Fields", "options": { - "general": "Default fields", - "preset": "Data source preset", - "custom": "Custom (insert below)" - } + "general": "General", + "preset": "Preset" + }, + "default": "general", + "tooltip": "What fields to use (see explanation above)" }, "css": { - "type": "choice", + "type": UserInput.OPTION_CHOICE, "help": "CSS", "options": { - "general": "Default template", - "preset": "Data source preset", - "custom": "Custom (insert below)" - } + "general": "General", + "preset": "Preset" + }, + "default": "general", + "tooltip": "What CSS styling to use (see explanation above)" } } }, - "explorer._explanation_custom_fields": { - "type": UserInput.OPTION_INFO, - "help": "You can customise how posts per data source appear in the Explorer. " - "This involves *custom fields*; a JSON that points to what fields should " - "be displayed. These fields can be formatted, for instance as a URL or together " - " with specific icons. If this JSON is absent, the Explorer by default shows the " - "`author`, `subject`, `timestamp`, `body`, and `image` fields. *Custom CSS* can be " - "added to change the appearance of posts. This allows to mimic the original platform " - "appearance. Custom CSS can be inserted below. For some data sources, pre-made templates " - "are available. These can be toggled below. 
If no custom or pre-made CSS is available, a " - "general template is used." - }, + "explorer" # Web tool settings # These are used by the FlaskConfig class in config.py # Flask may require a restart to update them diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 9d9996f11..4de5478c2 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -144,9 +144,21 @@ def parse_all(options, input, silently_correct=True): parsed_input[option] = [datasource for datasource, v in datasources.items() if v["enabled"]] parsed_input[option.split(".")[0] + ".expiration"] = datasources + elif settings.get("type") == UserInput.OPTION_DATASOURCES_TABLE: - # special case, loop through a table to generate a JSON - print("yea") + # special case, parse table values to generate a dict + columns = list(settings["columns"].keys()) + table_input = {} + + for datasource in list(settings["default"].keys()): + table_input[datasource] = {} + for column in columns: + + choice = input.get(option + "-" + datasource + "-" + column, False) + column_settings = settings["columns"][column] # sub-settings per column + table_input[datasource][column] = UserInput.parse_value(column_settings, choice, table_input, silently_correct=True) + + parsed_input[option] = table_input elif option not in input: # not provided? use default @@ -343,9 +355,6 @@ def parse_value(settings, choice, other_input=None, silently_correct=True): else: return choice - elif input_type == UserInput.DATASOURCES_TABLE: - return "weeird" - else: # no filtering return choice diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index 32d414360..e52f917d5 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -148,44 +148,41 @@ {% elif settings.type == "datasources_table" %} - {{settings}} -
- {{option}} -
- {{options.option}} + {% set tooltips = {} %}
- {% for column_id, column in settings.columns.items() %} - + {% endfor %} {% for datasource, datasource_config in datasources_config.items() %} {% if datasource_config.enabled %} - - {% if datasource in settings.default %} - {% set default_settings = settings.default[datasource] %} - {% else %} - {% default_settings = None %} - {% endif %} {% for column_id, column in settings.columns.items() %}
Data source{{ column.help }}{{ column.help }} + {% if column.tooltip %} + + {% set x = tooltips.__setitem__(column_id, column.tooltip) %} + {% endif %} +
{{ datasource_config.name }} - {% if column.type == "text" %} - - } + {% set column_value = "" %} + {% if datasource in settings.default and settings.default[datasource][column_id] %} + {% set column_value = settings.default[datasource][column_id] %} + {% endif %} + {% if column.type == "string" %} + {% elif column.type == "toggle" %} - + {% elif column.type == "choice" %} - {% for value, label in column.options.items() %} - + {% endfor %} {% endif %} @@ -196,6 +193,9 @@ {% endfor %}
+ {% for tooltip, tooltip_text in tooltips.items() %} + + {% endfor %}
{% endif %} From 340d1ff50aee44d82ba1533936f8e7b0cb492738 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:41:51 +0200 Subject: [PATCH 012/204] Simplify config setting name --- common/lib/config_definition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 482b5969a..636bb3c82 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -344,7 +344,7 @@ # "on how to format custom fields and CSS](https://github.com/digitalmethodsinitiative/4cat/wiki/" # "Exploring-and-annotating-datasets#add-custom-fields)." # }, - "explorer.explorer_config": { + "explorer.config": { "type": UserInput.OPTION_DATASOURCES_TABLE, "help": "Explorer settings per data source", "default": {"fourchan": {"enabled": True, "css": "preset", "fields": "general"}, "eightchan": {"enabled": True, "css": "general", "fields": "general"}, "eightkun": {"enabled": True, "css": "general", "fields": "general"}, "ninegag": {"enabled": True, "css": "general", "fields": "general"}, "bitchute": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcat": {"enabled": True, "css": "general", "fields": "general"}, "dmi-tcatv2": {"enabled": True, "css": "general", "fields": "general"}, "douban": {"enabled": True, "css": "general", "fields": "general"}, "douyin": {"enabled": False, "css": "general", "fields": "general"}, "imgur": {"enabled": True, "css": "general", "fields": "general"}, "upload": {"enabled": True, "css": "general", "fields": "general"}, "instagram": {"enabled": True, "css": "preset", "fields": "preset"}, "linkedin": {"enabled": True, "css": "general", "fields": "general"}, "parler": {"enabled": True, "css": "general", "fields": "general"}, "reddit": {"enabled": True, "css": "preset", "fields": "preset"}, "telegram": {"enabled": True, "css": "general", "fields": "general"}, "tiktok": {"enabled": True, "css": "preset", "fields": "preset"}, "tiktok-urls": {"enabled": True, "css": "preset", "fields": "preset"}, "tumblr": {"enabled": True, "css": "preset", "fields": "preset"}, "twitter": {"enabled": True, "css": "preset", "fields": "preset"}, "twitterv2": {"enabled": True, "css": "preset", "fields": "preset"}, "usenet": {"enabled": True, "css": "general", "fields": "general"}, "vk": {"enabled": True, "css": "general", "fields": "general"}}, From dfbe5f3478b08e95639ab6b477110eefb3ab6112 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 17:42:05 +0200 Subject: [PATCH 013/204] Only show Explorer when enabled per data source --- webtool/templates/components/result-result-row.html | 4 ++-- webtool/views/views_dataset.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 6a24e484d..7f71f0ef4 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -45,9 +45,9 @@ {% endif %}
  • - {% if __user_config("privileges.can_use_explorer") %} + {% if __user_config("privileges.can_use_explorer") and has_explorer %} - Explore + Explorer {% endif %} diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 411173a7a..725a965c0 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -423,7 +423,7 @@ def show_result(key): datasources = backend.all_modules.datasources datasource_expiration = config.get("datasources.expiration", {}).get(datasource, {}) expires_datasource = False - can_unexpire = ((config.get('expire.allow_optout') and \ + can_unexpire = ((config.get("expire.allow_optout") and \ datasource_expiration.get("allow_optout", True)) or datasource_expiration.get("allow_optout", False)) \ and (current_user.is_admin or dataset.is_accessible_by(current_user, "owner")) @@ -437,6 +437,8 @@ def show_result(key): elif dataset.parameters.get("expires-after"): timestamp_expires = dataset.parameters.get("expires-after") + has_explorer = config.get("explorer.config", {}).get(datasource, {}).get("enabled", False) + # if the dataset has parameters with credentials, give user the option to # erase them has_credentials = [p for p in dataset.parameters if p.startswith("api_") and p not in ("api_type", "api_track")] @@ -449,7 +451,8 @@ def show_result(key): return render_template(template, dataset=dataset, parent_key=dataset.key, processors=backend.all_modules.processors, is_processor_running=is_processor_running, messages=get_flashed_messages(), is_favourite=is_favourite, timestamp_expires=timestamp_expires, has_credentials=has_credentials, - expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, datasources=datasources) + expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, has_explorer=has_explorer, + datasources=datasources) @app.route('/results//processors/queue//', methods=["GET", "POST"]) From 28abb423e756693262998e614715232dbc5f28e9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 15 Apr 2024 19:26:21 +0200 Subject: [PATCH 014/204] First steps in integrating the Explorer more with the main interface --- webtool/static/css/stylesheet.css | 5 ++ .../components/datasource-option.html | 2 +- webtool/templates/explorer/controls.html | 54 +++++++++++++ webtool/templates/explorer/explorer.html | 79 ++++++++----------- webtool/templates/explorer/footer.html | 1 - webtool/templates/explorer/header.html | 54 ------------- webtool/views/views_explorer.py | 33 ++++---- 7 files changed, 109 insertions(+), 119 deletions(-) create mode 100644 webtool/templates/explorer/controls.html delete mode 100644 webtool/templates/explorer/footer.html delete mode 100644 webtool/templates/explorer/header.html diff --git a/webtool/static/css/stylesheet.css b/webtool/static/css/stylesheet.css index acc4409be..0fd5d7733 100644 --- a/webtool/static/css/stylesheet.css +++ b/webtool/static/css/stylesheet.css @@ -1205,4 +1205,9 @@ ol.result-list li.has_results .property-container.analysis a { .result-list .child-list > li { padding: 0; margin: 0.5em 0 0 0; +} + +#explorer-posts, #explorer-posts > ol li { + all: initial; + padding: 0; } \ No newline at end of file diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html index e52f917d5..51e26bf16 100644 --- a/webtool/templates/components/datasource-option.html +++ b/webtool/templates/components/datasource-option.html @@ -153,7 +153,7 @@ - + {% for column_id, column in settings.columns.items() %}
    Data sourceEnabled data sources{{ column.help }} {% if column.tooltip %} diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html new file mode 100644 index 000000000..f0bcd793d --- /dev/null +++ b/webtool/templates/explorer/controls.html @@ -0,0 +1,54 @@ + +
    +

    + {{ dataset.get_label() }} +

    + + {% if key %} +
    + {% if post_count > max_posts %} +

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    + {% set post_count = max_posts %} + {% endif %} +
    + + {% elif thread %} +

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    +

    Note that the archived posts may not be complete.

    + {% endif %} + +
    + +

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    +{% if custom_fields and custom_fields[0] == "invalid" %} +

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    +{% endif %} +{% if custom_fields and 'sort_options' in custom_fields %} +
    +

    Sort posts by: + +

    +
    +{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 9a7251990..6cde4748b 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -1,48 +1,37 @@ - - - 4CAT Explorer • {% if parameters and parameters.get("label") %}{{ parameters.get("label") }}{% elif key %}{{ key }}{% elif thread %}{{ thread }}{% endif %} - - - - - - - - - - - - - - - - - - - - - - {% if custom_css %} - - {% endif %} - - - -
    - {% include "explorer/header.html" %} - - {% if not thread %} - {% include "explorer/annotations.html" %} - {% endif %} -
    +{% extends "layout.html" %} + +{% block title %}Datasets & previous results{% endblock %} +{% block body_class %}result-list plain-page{% endblock %} +{% block breadcrumbs %}{% set navigation.current = "dataset" %}{% endblock %} + +{% block body %} + + + + + + + + + + +{% if custom_css %} + +{% endif %} + +{% include "explorer/controls.html" %} + +{% if not thread %} + {% include "explorer/annotations.html" %} +{% endif %}
    {% include "explorer/nav-pages.html" %} -
      +
        {% for post in posts %} {% include "explorer/post.html" %} {% endfor %} @@ -50,8 +39,4 @@ {% include "explorer/nav-pages.html" %}
    -
    - {% include "explorer/footer.html" %} -
    - - \ No newline at end of file +{% endblock %} \ No newline at end of file diff --git a/webtool/templates/explorer/footer.html b/webtool/templates/explorer/footer.html deleted file mode 100644 index 258a61e98..000000000 --- a/webtool/templates/explorer/footer.html +++ /dev/null @@ -1 +0,0 @@ -

    Rendered by 4CAT

    \ No newline at end of file diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html deleted file mode 100644 index ab3472fa7..000000000 --- a/webtool/templates/explorer/header.html +++ /dev/null @@ -1,54 +0,0 @@ -

    - - Return to dataset - - - - 4CAT Explorer {% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %} - -

    -{{ key }} -
    - {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    - {% if custom_fields and custom_fields[0] == "invalid" %} -

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    - {% endif %} - {% if custom_fields and 'sort_options' in custom_fields %} -
    -

    Sort posts by: - -

    -
    - {% endif %} -
    - - {% elif thread %} -

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    -

    Note that the archived posts may not be complete.

    - {% endif %} - -
    \ No newline at end of file diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index b02dcf1c2..437b20f2c 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -28,8 +28,8 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/explorer/dataset//', defaults={'page': 0}) -@app.route('/explorer/dataset//') +@app.route('/results//explorer/', defaults={'page': 0}) +@app.route('/results//explorer/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -50,7 +50,14 @@ def explorer_dataset(key, page): dataset = DataSet(key=key, db=db) except DataSetException: return error(404, error="Dataset not found.") + + # Load some variables + parameters = dataset.get_parameters() + datasource = parameters["datasource"] + post_count = int(dataset.data["num_rows"]) + annotation_fields = dataset.get_annotation_fields() + # See if we can actually serve this page if dataset.is_private and not (config.get("privileges.can_view_all_datasets") or dataset.is_accessible_by(current_user)): return error(403, error="This dataset is private.") @@ -61,6 +68,9 @@ def explorer_dataset(key, page): if not results_path: return error(404, error="This dataset didn't finish executing") + if datasource not in config.get("explorer.config") and not config["explorer.config"][datasource]["enabled"]: + return error(404, error="Explorer functionality disabled for %s" % datasource) + # The amount of posts to show on a page posts_per_page = config.get("explorer.posts_per_page", 50) @@ -70,19 +80,10 @@ def explorer_dataset(key, page): # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 - # Load some variables - parameters = dataset.get_parameters() - datasource = parameters["datasource"] - board = parameters.get("board", "") - post_count = int(dataset.data["num_rows"]) - annotation_fields = dataset.get_annotation_fields() - - # If the dataset is local, we can add some more features - # (like the ability to navigate to threads) - is_local = False # CHANGE LATER ///////////////////// - if datasource in list(all_modules.datasources.keys()): - is_local = True if all_modules.datasources[datasource].get("is_local") else False + # f the dataset is generated from an API-accessible database, we can add + # extra features like the ability to navigate across posts. + has_database = False # CHANGE LATER ///////////////////// # Check if we have to sort the data. sort_by = request.args.get("sort") @@ -121,7 +122,7 @@ def explorer_dataset(key, page): post_ids.append(row["id"]) posts.append(row) - # Stop if we exceed the allowed posts per page or max. posts. + # Stop if we exceed the allowed posts per page or max posts. 
if count >= (offset + posts_per_page) or count > max_posts: break @@ -161,7 +162,7 @@ def explorer_dataset(key, page): annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) @app.route('/explorer/thread///') @api_ratelimit From 70d00b1a5a056fc1286b50c502f437ef32432fa9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 16 Apr 2024 17:52:16 +0200 Subject: [PATCH 015/204] First steps in bringing back sorting --- common/lib/dataset.py | 34 +++- common/lib/helpers.py | 18 ++- .../static/css/explorer/explorer-default.css | 145 +----------------- webtool/static/js/explorer.js | 6 +- webtool/templates/explorer/controls.html | 73 ++++----- webtool/templates/explorer/explorer.html | 9 +- webtool/templates/explorer/nav-pages.html | 61 -------- webtool/templates/explorer/pagination.html | 63 ++++++++ webtool/views/views_explorer.py | 89 ++++------- 9 files changed, 183 insertions(+), 315 deletions(-) delete mode 100644 webtool/templates/explorer/nav-pages.html create mode 100644 webtool/templates/explorer/pagination.html diff --git a/common/lib/dataset.py b/common/lib/dataset.py index a48e6f053..dd7a96eec 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ import backend from common.config_manager import config from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, convert_to_float, flatten_dict from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -235,7 +235,7 @@ def log(self, log): with log_path.open("a", encoding="utf-8") as outfile: outfile.write("%s: %s\n" % (datetime.datetime.now().strftime("%c"), log)) - def _iterate_items(self, processor=None): + def _iterate_items(self, processor=None, sort=None): """ A generator that iterates through a CSV or NDJSON file @@ -268,6 +268,14 @@ def _iterate_items(self, processor=None): wrapped_infile = NullAwareTextIOWrapper(infile, encoding="utf-8") reader = csv.DictReader(wrapped_infile, **csv_parameters) + # In some cases, we want to sort the dataset first. + if sort: + # Generate reader on the basis of sort value + # At the moment, this is very inefficient, but + # suffices for the few cases where `sort` is used. 
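The CSV branch that continues below derives a positional index with `next(reader).index(sort)`, but `csv.DictReader` yields dicts, which have no `.index()`, and the `next()` call would also consume the first data row (the header is already handled by `DictReader` itself). A working sketch of the intended column sort, using the `convert_to_float` helper this commit adds to common/lib/helpers.py:

```python
import csv
from common.lib.helpers import convert_to_float

# Hedged sketch: sort a csv.DictReader on the requested column *name*
# rather than a positional index. Materialising the reader in sorted()
# keeps the same memory trade-off the patch itself accepts.
def iterate_csv_sorted(infile, sort, descending=True):
    reader = csv.DictReader(infile)
    for row in sorted(reader, key=lambda r: convert_to_float(r.get(sort)), reverse=descending):
        yield row
```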
+ sort_by_index = next(reader).index(sort) + reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) + for item in reader: if hasattr(processor, "interrupted") and processor.interrupted: raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") @@ -277,16 +285,28 @@ def _iterate_items(self, processor=None): elif path.suffix.lower() == ".ndjson": # In NDJSON format each line in the file is a self-contained JSON with path.open(encoding="utf-8") as infile: - for line in infile: - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + # Sorting can't be done easily here, + # we have to loop through the entire JSON first. + # Don't enable for large files! + if sort: + + for line in sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True): + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + yield line + + else: + for line in infile: + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") + yield json.loads(line) else: raise NotImplementedError("Cannot iterate through %s file" % path.suffix) - def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default"): + def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default", sort=None): """ Generate mapped dataset items @@ -338,7 +358,7 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau item_mapper = True # Loop through items - for i, item in enumerate(self._iterate_items(processor)): + for i, item in enumerate(self._iterate_items(processor, sort=sort)): # Save original to yield original_item = item.copy() diff --git a/common/lib/helpers.py b/common/lib/helpers.py index d097e4b72..306c435a3 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -206,6 +206,22 @@ def convert_to_int(value, default=0): except (ValueError, TypeError): return default +def convert_to_float(value, default=0): + """ + Convert a value to a floating point, with a fallback + + The fallback is used if an Error is thrown during converstion to float. + This is a convenience function, but beats putting try-catches everywhere + we're using user input as a floating point number. 
+ + :param value: Value to convert + :param int default: Default value, if conversion not possible + :return int: Converted value + """ + try: + return float(value) + except (ValueError, TypeError): + return default def timify_long(number): """ @@ -789,7 +805,7 @@ def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'): Lists will be converted to json strings via json.dumps() :param MutableMapping d: Dictionary like object - :param str partent_key: The original parent key prepending future nested keys + :param str parent_key: The original parent key prepending future nested keys :param str sep: A seperator string used to combine parent and child keys :return dict: A new dictionary with the no nested values """ diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/explorer-default.css index a3a60c0b2..562f150d9 100644 --- a/webtool/static/css/explorer/explorer-default.css +++ b/webtool/static/css/explorer/explorer-default.css @@ -30,138 +30,6 @@ See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotati src: url("../fonts/OpenSans-BoldItalic.ttf") } -/** --------------------- * - Basic HTML elements - * --------------------- */ -*, body, code, select, input, textarea { - font-family: 'Open Sans', 'Trebuchet MS', sans-serif; -} - -body { - background-color: #f9fbff; - margin: 0; - padding: 0; -} - -select, input, textarea { - background: var(--gray); - border: 1px solid var(--gray); - font-size: 14px; - padding: 0.25em; -} - -label { - font-size: 14px; -} - -button { - border: 2px solid var(--contrast-dark); - background: var(--contrast-bright); - color: var(--text); - border-radius: 0.5em; - font-size: 14px; - cursor: pointer; - padding: 0.25em 1em; -} - -button:hover { - background: var(--accent); - color: var(--contrast-bright); -} - -button.invalid, button.invalid:hover { - cursor: default; - background: var(--contrast-bright); - color: var(--text); - border-color: var(--gray-dark); -} - -textarea { - width: 340px; -} - -/** --------------------- * - Header - * --------------------- */ - -body > header { - width: 100%; - margin: 0; - padding: 0; -} - -body > header h1 { - box-shadow: 0 5px 10px #888; - margin: 0; - padding: 0; - font-size: 1.5em; - background: var(--contrast-dark); - color: var(--contrast-bright); - text-align: center; - font-size: 1.5em; - line-height: 1.5em; - font-weight: bold; - padding: 0.50em 0; - cursor: default; -} - -body > header #metadata { - font-size: 16px; - min-width: 640px; - max-width: 960px; - margin: 0 auto; - margin-top: 40px; - margin-bottom: 40px; - text-align: center; -} - -body > header #metadata #parameters > span { - font-family: monospace; - font-size: 12px; - display: inline-block; - background: white; - margin: 2px; - padding: 4px; - border: 1px solid black; - border-radius: 5px; - cursor: default; -} - -body > header .return a { - position: absolute; - left: 0; - padding-left: 12px; - color: white; - font-size: 0.6em; - text-decoration: none; -} - -#dataset-key { - display: none; -} - -/** --------------------- * - Navigation pages - * --------------------- */ -.nav-pages { - text-align: center; -} - -span.page { - display: inline-block; - padding: 10px; - min-width: 20px; - overflow: hidden; - color: black; - background-color: white; - font-family: monospace; - border: 1px solid black; -} - -span.page.selected { - color: white; - background-color: black; -} /** --------------------- * Posts @@ -265,7 +133,7 @@ span.divider { /** --------------------- * Annotations 
editor * --------------------- */ -#annotations-editor-container { +/*#annotations-editor-container { background: rgba(0, 0, 0, .4); display: none; height: 100%; @@ -500,13 +368,4 @@ li.post.op > .post-annotations { } .posts .external-url { -} - -/** --------------------- * - Footer - * --------------------- */ -footer { - text-align: center; - margin-top: 40px; - margin-bottom: 70px; -} \ No newline at end of file +}*/ \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 1ee2acc9a..9f0e05a88 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -709,7 +709,7 @@ const annotations = { error: function (error) { annotations.enableSaving(); $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); + //alert("Could't save annotations"); console.log(error) } }); @@ -854,7 +854,9 @@ const page_functions = { force_int = "" } - window.location.href = getRelativeURL('explorer/dataset/' + $("#dataset-key").text() + "?sort=" + $(this).val() + sort_order + force_int); + let dataset_key = $("#dataset-key").text(); + alert(dataset_key) + window.location.href = getRelativeURL("result/" + dataset_key + "/explorer/?sort=" + $(this).val() + sort_order + force_int); }); // Change the dropdown sort option based on the URL parameter diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index f0bcd793d..aeaa5bebd 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -1,54 +1,43 @@ - -
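The explorer.js change above now sends the browser to `result/<key>/explorer/?sort=...` (note the `alert(dataset_key)` left in, presumably a temporary debug call), and the rewritten view at the end of this patch maps the 1-based page number to an offset with `offset = (page - 1) * posts_per_page` before skipping and breaking inside the item loop. `itertools.islice` expresses the same window in one step; a sketch assuming the `dataset` object from that view:

```python
from itertools import islice

# Page numbers are 1-based, so page 1 maps to offset 0. islice skips
# `offset` rows and yields at most `posts_per_page` rows, replacing the
# manual count/continue/break bookkeeping in the view's loop.
def get_page(dataset, page, posts_per_page, sort=None):
    offset = (page - 1) * posts_per_page
    rows = dataset.iterate_items(warn_unmappable=False, sort=sort)
    return list(islice(rows, offset, offset + posts_per_page))
```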
    +

    {{ dataset.get_label() }}

    + {% if custom_fields and custom_fields[0] == "invalid" %} +
    + Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). +
    + {% endif %} {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -
    - - {% elif thread %} -

    Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

    -

    Note that the archived posts may not be complete.

    +
    + {% if post_count > max_posts %} +

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    + {% set post_count = max_posts %} + {% endif %} +
    {% endif %} -
    + {# some different info for views generated by a direct API call #} + {% if not key and has_database %} +

    Showing {{ post_count }} posts from {{ datasource }}.

    + {% else %} +

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    + {% endif %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    -{% if custom_fields and custom_fields[0] == "invalid" %} -

    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

    -{% endif %} -{% if custom_fields and 'sort_options' in custom_fields %} -
    -

    Sort posts by: - -

    +
    +

    Sort posts by: + +

    -{% endif %} \ No newline at end of file + +
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 6cde4748b..4ae46f924 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -23,20 +23,25 @@ {% endif %} +{{ key }} + {% include "explorer/controls.html" %} +{% include "explorer/pagination.html" %} + {% if not thread %} {% include "explorer/annotations.html" %} {% endif %} +
    - {% include "explorer/nav-pages.html" %}
      {% for post in posts %} {% include "explorer/post.html" %} {% endfor %}
    - {% include "explorer/nav-pages.html" %}
    +{% include "explorer/pagination.html" %} + {% endblock %} \ No newline at end of file diff --git a/webtool/templates/explorer/nav-pages.html b/webtool/templates/explorer/nav-pages.html deleted file mode 100644 index b212f7d65..000000000 --- a/webtool/templates/explorer/nav-pages.html +++ /dev/null @@ -1,61 +0,0 @@ - \ No newline at end of file diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html new file mode 100644 index 000000000..ebf522890 --- /dev/null +++ b/webtool/templates/explorer/pagination.html @@ -0,0 +1,63 @@ + \ No newline at end of file diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 437b20f2c..9264745c5 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -28,13 +28,13 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 0}) -@app.route('/results//explorer/') +@app.route('/result//explorer/', defaults={'page': 1}) +@app.route('/result//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(key, page): +def explorer_dataset(key, page=1): """ Show posts from a dataset @@ -80,15 +80,14 @@ def explorer_dataset(key, page): # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 - - # f the dataset is generated from an API-accessible database, we can add + # If the dataset is generated from an API-accessible database, we can add # extra features like the ability to navigate across posts. - has_database = False # CHANGE LATER ///////////////////// + has_database = False # INTEGRATE LATER ///////////////////// # Check if we have to sort the data. - sort_by = request.args.get("sort") - if sort_by == "dataset-order": - sort_by = None + sort = request.args.get("sort") + if sort == "dataset-order": + sort = None # Check if we have to reverse the order. descending = request.args.get("desc") @@ -109,8 +108,10 @@ def explorer_dataset(key, page): posts = [] count = 0 + sort = "id" + try: - for row in dataset.iterate_items(warn_unmappable=False): + for row in dataset.iterate_items(warn_unmappable=False, sort=sort): count += 1 @@ -164,15 +165,14 @@ def explorer_dataset(key, page): # Generate the HTML page return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) -@app.route('/explorer/thread///') +@app.route('/result///explorer') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_local_thread(datasource, board, thread_id): +def explorer_database_thread(datasource, board, thread_id): """ - Show a thread. This is only available for local data sources, - and will be depracated/changed in future updates. + Show a thread from an API-accessible database. 
 :param str datasource: Data source ID :param str board: Board name @@ -217,11 +217,9 @@ @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_local_posts(datasource, board, thread_id): +def explorer_database_posts(datasource, board, thread_id): """ - Show a posts from a local data source. - This is only available for local data sources, - and will be depracated/changed in future updates. + Show posts from an API-accessible database. :param str datasource: Data source ID :param str board: Board name @@ -535,34 +533,25 @@ def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=Fal suffix = in_file.name.split(".")[-1].lower() - if suffix == "csv": - - with open(in_file, "r", encoding="utf-8") as dataset_file: - - # Sort on date by default - # Unix timestamp integers are not always saved in the same field. - reader = csv.reader(dataset_file) - columns = next(reader) - if sort_by: - try: - # Get index number of sort_by value - sort_by_index = columns.index(sort_by) - - # Generate reader on the basis of sort_by value - reader = sorted(reader, key=lambda x: to_float(x[sort_by_index], convert=force_int) if len(x) >= sort_by_index else 0, reverse=descending) + # Sort on date by default + # Unix timestamp integers are not always saved in the same field. + reader = csv.reader(dataset_file) + columns = next(reader) + if sort_by: + try: + print("YES") + except (ValueError, IndexError) as e: + pass - except (ValueError, IndexError) as e: - pass + for item in reader: - for item in reader: + # Add columns + #item = {columns[i]: item[i] for i in range(len(item))} - # Add columns - item = {columns[i]: item[i] for i in range(len(item))} - - yield item - - elif suffix == "ndjson": + yield item + if suffix == "ndjson": + print("TRUEE") # In this format each line in the file is a self-contained JSON # file with open(in_file, "r", encoding="utf-8") as dataset_file: # Unfortunately we can't easily sort here. # We're just looping through the file if no sort is given. if not sort_by: for line in dataset_file: item = json.loads(line) yield item # If a sort order is given explicitly, we're sorting anyway. else: keys = sort_by.split(".") - - if max_rows: - for item in sorted([json.loads(line) for i, line in enumerate(dataset_file) if i < max_rows], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item - else: - for item in sorted([json.loads(line) for line in dataset_file], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item + yield item return Exception("Can't loop through file with extension %s" % suffix) @@ -689,14 +672,6 @@ def get_nested_value(di, keys): return 0 return di -def to_float(value, convert=False): - if convert: - if not value: - value = 0 - else: - value = float(value) - return value - def strip_html(post): post["body"] = strip_tags(post.get("body", "")) return post From e93736233546b446c2accd47d8307710762d4e7c Mon Sep 17 00:00:00 2001 From: Sal Hagen <sal-hagen@hotmail.com> Date: Wed, 17 Apr 2024 12:49:36 +0200 Subject: [PATCH 016/204] More sorting stuff --- common/lib/dataset.py | 33 ++---- .../static/css/explorer/explorer-default.css | 4 + webtool/templates/explorer/controls.html | 14 ++- webtool/templates/explorer/explorer.html | 2 +- webtool/views/views_explorer.py | 103 +++++++----------- 5 files changed, 63 insertions(+), 93 deletions(-) diff --git a/common/lib/dataset.py b/common/lib/dataset.py index dd7a96eec..5eb49c37c 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -235,7 +235,7 @@ def log(self, log): with log_path.open("a", encoding="utf-8") as outfile: outfile.write("%s: %s\n" % (datetime.datetime.now().strftime("%c"), log)) - def _iterate_items(self, processor=None, sort=None): + def _iterate_items(self, processor=None): """ A generator that iterates through a CSV or NDJSON file @@ -268,14 +268,6 @@ wrapped_infile = NullAwareTextIOWrapper(infile, encoding="utf-8") reader = csv.DictReader(wrapped_infile, **csv_parameters) - # In some cases, we want to sort the dataset first. - if sort: - # Generate reader on the basis of sort value - # At the moment, this is very inefficient, but - # suffices for the few cases where `sort` is used. - sort_by_index = next(reader).index(sort) - reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) - for item in reader: if hasattr(processor, "interrupted") and processor.interrupted: raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file") yield item elif path.suffix.lower() == ".ndjson": - # In NDJSON format each line in the file is a self-contained JSON + with path.open(encoding="utf-8") as infile: - # Sorting can't be done easily here, - # we have to loop through the entire JSON first. - # Don't enable for large files! - if sort: - - for line in sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True): - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - yield line + for line in infile: + if hasattr(processor, "interrupted") and processor.interrupted: + raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - else: - for line in infile: - if hasattr(processor, "interrupted") and processor.interrupted: - raise ProcessorInterruptedException("Processor interrupted while iterating through NDJSON file") - yield json.loads(line) else: raise NotImplementedError("Cannot iterate through %s file" % path.suffix) - def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default", sort=None): + def iterate_items(self, processor=None, warn_unmappable=True, map_missing="default"): """ Generate mapped dataset items @@ -358,7 +339,7 @@ item_mapper = True # Loop through items - for i, item in enumerate(self._iterate_items(processor, sort=sort)): + for i, item in enumerate(self._iterate_items(processor)): # Save original to yield original_item = item.copy() diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/explorer-default.css index 562f150d9..f726d741c 100644 --- a/webtool/static/css/explorer/explorer-default.css +++ b/webtool/static/css/explorer/explorer-default.css @@ -133,6 +133,10 @@ span.divider { /** --------------------- * Annotations editor * --------------------- */ +#annotations-editor-container { + display: none; +} + /*#annotations-editor-container { background: rgba(0, 0, 0, .4); display: none; diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index aeaa5bebd..3ab99c3b0 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -31,11 +31,15 @@ </div 

    Sort posts by:

    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 4ae46f924..4ab1fd206 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -6,7 +6,7 @@ {% block body %} - + diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 9264745c5..576c1e484 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -90,28 +90,20 @@ def explorer_dataset(key, page=1): sort = None # Check if we have to reverse the order. - descending = request.args.get("desc") - if descending == "true" or descending == True: - descending = True - else: - descending = False + reverse = True if request.args.get("desc") in ("true", True) else False # Check if we have to convert the sort value to an integer. - force_int = request.args.get("int") - if force_int == "true" or force_int == True: - force_int = True - else: - force_int = False + force_number = True if request.args.get("int") in ("true", True) else False # Load posts post_ids = [] posts = [] count = 0 - sort = "id" - - try: - for row in dataset.iterate_items(warn_unmappable=False, sort=sort): + # If we're sorting, we need to iterate over the entire + # dataset first. Else we can simply use `iterate_items`. + if not sort: + for row in dataset.iterate_items(warn_unmappable=False): count += 1 @@ -126,9 +118,15 @@ def explorer_dataset(key, page=1): # Stop if we exceed the allowed posts per page or max posts. if count >= (offset + posts_per_page) or count > max_posts: break - - except NotImplementedError: - return error(404) + else: + for row in sort_and_iterate_items(dataset, sort, reverse=reverse, warn_unmappable=False): + count += 1 + if count <= offset: + continue + post_ids.append(row["id"]) + posts.append(row) + if count >= (offset + posts_per_page) or count > max_posts: + break # Retrieve custom CSS if it is present in the datasource's config. # If not given, we use a standard template. This standard CSS template @@ -238,7 +236,7 @@ def explorer_database_posts(datasource, board, thread_id): return error(404, error="No thread ID provided") # Get the posts with this thread ID. - posts = get_local_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) + posts = get_database_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) posts = [strip_html(post) for post in posts] posts = [format(post) for post in posts] @@ -521,58 +519,41 @@ def get_image_file(img_file): return send_file(str(image_path)) -def iterate_items_with_sort(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): +def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False, **kwargs): """ Loop through both csv and NDJSON files. - :param in_file, str: The input file to read. - :param sort_by, str: The key that determines the sort order. - :param descending, bool: Whether to sort by descending values. - :param force_int, bool: Whether the sort value should be converted to an - integer. + This is basically a wrapper function for `iterate_items()` with the + added functionality of sorting a dataset. Because the Explorer is (currently) + the only feature that requires sorting, we define it here. + This first iterates through the entire file (with a max limit) to determine + an order. Then it yields items based on this order. + + :param key, str: The dataset object. + :param sort_by, str: The item key that determines the sort order. 
+ :param reverse, bool: Whether to sort by largest values first. + :param force_number, bool: Whether the sort value should be converted to a + floating point number. """ - suffix = in_file.name.split(".")[-1].lower() - - # Sort on data date by default - # Unix timestamp integers are not always saved in the same field. - reader = csv.reader(dataset_file) - columns = next(reader) - if sort_by: - try: - print("YES") - except (ValueError, IndexError) as e: - pass - - for item in reader: - - # Add columns - #item = {columns[i]: item[i] for i in range(len(item))} - - yield item - - if suffix == "ndjson": - print("TRUEE") - # In this format each line in the file is a self-contained JSON - # file - with open(in_file, "r", encoding="utf-8") as dataset_file: + # Storing posts in the right order here + posts = [] - # Unfortunately we can't easily sort here. - # We're just looping through the file if no sort is given. - if not sort_by: - for line in dataset_file: - item = json.loads(line) - yield item + # Generate reader on the basis of sort value + # At the moment, this is very inefficient, but + # suffices for the few cases where `sort` is used. + #sort_by_index = next(reader).index(sort) + #reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) + #sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True) - # If a sort order is given explicitly, we're sorting anyway. - else: - keys = sort_by.split(".") - yield item + for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort]): + posts.append(item) - return Exception("Can't loop through file with extension %s" % suffix) + for post in posts: + yield post -def get_local_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): +def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ - Retrieve posts from a local data source based on post IDs. + Retrieve posts by ID from a database-accessible data source. 
""" if not ids: From 11eaaf945fa6205077ec736ba688027c01f5a0f4 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Apr 2024 17:46:23 +0200 Subject: [PATCH 017/204] Fix and simplify sorting, control box styling --- webtool/static/css/dataset-page.css | 7 ++ webtool/static/js/explorer.js | 75 ++++++++---------- .../explorer/annotations-editor.html | 55 ++++++++++++++ webtool/templates/explorer/annotations.html | 69 ----------------- webtool/templates/explorer/controls.html | 76 +++++++++++-------- webtool/templates/explorer/explorer.html | 7 +- webtool/views/views_explorer.py | 51 ++++++------- 7 files changed, 162 insertions(+), 178 deletions(-) create mode 100644 webtool/templates/explorer/annotations-editor.html delete mode 100644 webtool/templates/explorer/annotations.html diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 1c7b908da..e82742bda 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -198,6 +198,12 @@ article.result > section:first-child { line-height: 1.3em; } +.button-like-small.disabled { + cursor: not-allowed; + opacity: 0.5; +} + + .dataset-owner-list li { display: inline-block; } @@ -225,6 +231,7 @@ article.result > section:first-child { background: var(--gray-light); border: 1px solid var(--gray-dark); font-size: 0.8em; + cursor: pointer; } .dataset-toolbox a:hover, a.button-like-small:hover { diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 9f0e05a88..8c45e9b7e 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -37,7 +37,15 @@ const annotations = { $("#add-annotation-field").on("click", function(){$("#annotation-fields").append(annotations.getAnnotationsDiv);}); // Show and hide the annotations editor - $("#toggle-annotation-fields").on("click", function(){$("#annotations-editor-container").toggle();}); + $("#toggle-annotation-fields").on("click", function(){ + $("#annotations-editor-container").toggle(); + if ($("#annotation-controls-buttons").hasClass("hidden")) { + $(this).html(""); + } + else { + $(this).html(""); + } + }); $("#annotations-editor").click(function(e) { e.stopPropagation(); }); @@ -599,8 +607,7 @@ const annotations = { // If the query is accepted by the server. if (response == 'success') { $("#annotations-editor-container").hide(); - $("#save-annotation-fields").addClass("invalid") - $("#save-annotation-fields").prop("disabled", true); + $("#save-annotation-fields").addClass("disabled"); } // If the query is rejected by the server. @@ -695,7 +702,7 @@ const annotations = { annotations.enableSaving(); $("#save-annotations").html(" Annotations saved"); - $("#save-annotations").addClass("invalid").prop("disabled", true); + $("#save-annotations").addClass("disabled"); old_annotation_fields = $("#annotation-fields").html(); // alert(alert_message); } @@ -746,24 +753,24 @@ const annotations = { // So we just need to check whether they're there. 
if (Object.keys(annotation_fields).length < 1) { - $("#toggle-annotations").addClass("invalid"); + $("#toggle-annotations").addClass("disabled"); return false; } else { - $("#toggle-annotations").removeClass("invalid"); + $("#toggle-annotations").removeClass("disabled"); return true; } }, enableSaving: function(){ // Enable saving annotations to the database - $("#save-annotations, #save-to-dataset").removeClass("invalid").removeAttr("disabled"); + $("#save-annotations, #save-to-dataset").removeClass("disabled"); $("#save-annotations").html(" Save annotations"); }, disableSaving: function(){ // Disable saving annotations to the database - $("#save-annotations, #save-to-dataset").addClass("invalid").prop("disabled", true); + $("#save-annotations, #save-to-dataset").addClass("disabled"); }, warnEditor: function(warning) { @@ -778,13 +785,13 @@ const annotations = { toggleAnnotations: function() { let ta = $("#toggle-annotations"); - if (ta.hasClass("hidden")) { - ta.removeClass("hidden"); + if (ta.hasClass("shown")) { + ta.removeClass("shown"); ta.html(" Hide annotations"); $(".post-annotations").show(200); } else { - ta.addClass("hidden"); + ta.addClass("shown"); ta.html(" Show annotations"); $(".post-annotations").hide(200); } @@ -832,51 +839,29 @@ const page_functions = { })); // Reorder the dataset when the sort type is changed - $("#sort-select").on("change", function(){ + $(".sort-select").on("change", function(){ - let selected = $(this).find("option:selected"); - - // Pass whether the order should be reversed or not - let sort_order = selected.data("desc"); - if (sort_order){ - sort_order = "&desc=true" - } - else { - sort_order = "" - } + // Get the column to sort on, an whether we should sort in reverse. + let selected = $("#column-sort-select").find("option:selected").val(); + let order = $("#column-sort-order").find("option:selected").val(); - // Pass whether we should treat this value as an integer - let force_int = selected.data("force-int"); - if (force_int){ - force_int = "&int=true" - } - else { - force_int = "" + sort_order = "" + if (order == "reverse"){ + sort_order = "&order=reverse" } let dataset_key = $("#dataset-key").text(); - alert(dataset_key) - window.location.href = getRelativeURL("result/" + dataset_key + "/explorer/?sort=" + $(this).val() + sort_order + force_int); + window.location.href = getRelativeURL("results/" + dataset_key + "/explorer/?sort=" + selected + sort_order); }); // Change the dropdown sort option based on the URL parameter let searchParams = new URLSearchParams(window.location.search) - let sort_order = searchParams.get("sort"); - let desc = searchParams.get("desc"); - + let selected = searchParams.get("sort"); + let sort_order = searchParams.get("order"); + $("#column-sort-select").find("option[value='" + selected + "']").attr("selected", "selected"); if (sort_order) { - // There can be multiple options with the same key since - // one of them might be reversed and the other not (e.g. - // timestamps sorted by new to old and vice versa). - // So select the sort order with the right desc attribute. 
- if (desc == "true") { - $("#sort-select").find("option[value='" + sort_order + "'][data-desc='True']").attr("selected", "selected"); - } - else { - $("#sort-select").val(sort_order); - } + $("#column-sort-order").find("option[value='" + sort_order + "']").attr("selected", "selected"); } - } }; diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html new file mode 100644 index 000000000..71c7dc5f3 --- /dev/null +++ b/webtool/templates/explorer/annotations-editor.html @@ -0,0 +1,55 @@ +
    + + +
    +
    +
    Label
    +
    Input type
    +
    Options
    +
    + +
    + {% if annotation_fields %} + + {% for field in annotation_fields %} + + {% set annotation_type = annotation_fields[field]["type"] %} + {% set label = annotation_fields[field]["label"] %} +
    + + + + + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    + {% endfor %} +
    + + +
    +
    + {% endif %} +
    + {% endfor %} + {% endif %} +
    +
    +
    + + + +

    Note: Changing input types will overwrite existing annotations for the field

    +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/annotations.html b/webtool/templates/explorer/annotations.html deleted file mode 100644 index fbb0b89bb..000000000 --- a/webtool/templates/explorer/annotations.html +++ /dev/null @@ -1,69 +0,0 @@ -
    -
    - -
    ×
    - -
    -
    -
    Label
    -
    Input type
    -
    Options
    -
    - -
    - {% if annotation_fields %} - - {% for field in annotation_fields %} - - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} -
    - - - - - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    - {% endfor %} -
    - - -
    -
    - {% endif %} -
    - {% endfor %} - {% endif %} -
    -
    -
    - - - -

    Note: Changing input types will overwrite existing annotations for the field

    -
    -
    -
    - -
    -
    - - - | - - -
    - -
    \ No newline at end of file diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index 3ab99c3b0..3dc77990d 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -5,43 +5,57 @@

    + {% if custom_fields and custom_fields[0] == "invalid" %}
    Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).
    {% endif %} - {% if key %} -
    - {% if post_count > max_posts %} -

    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

    - {% set post_count = max_posts %} - {% endif %} -
    + {% if key and post_count > max_posts %} +
    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.
    + {% set post_count = max_posts %} {% endif %} - {# some different info for views generated by a direct API call #} - {% if not key and has_database %} -

    Showing {{ post_count }} posts from {{ datasource }}.

    - {% else %} -

    Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total).

    - {% endif %} - - -
    -

    Sort posts by: - -

    -
    +
    + {% if not key and has_database %} + Showing {{ post_count }} posts from {{ datasource }}. + {% else %} + Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total). + {% endif %} +
    + +
    + +
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 4ab1fd206..05fa4c0a5 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -23,17 +23,12 @@ {% endif %} -{{ key }} +{% set key = dataset.data.key %} {% include "explorer/controls.html" %} {% include "explorer/pagination.html" %} -{% if not thread %} - {% include "explorer/annotations.html" %} -{% endif %} - -
      {% for post in posts %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 576c1e484..7e1b389a9 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -21,15 +21,15 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import format_chan_post, error, setting_required from common.lib.dataset import DataSet -from common.lib.helpers import strip_tags +from common.lib.helpers import strip_tags, convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/result//explorer/', defaults={'page': 1}) -@app.route('/result//explorer/page/') +@app.route('/results//explorer/', defaults={'page': 1}) +@app.route('/results//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -62,14 +62,14 @@ def explorer_dataset(key, page=1): return error(403, error="This dataset is private.") if len(dataset.get_genealogy()) > 1: - return error(404, error="Unavailable for top-level datasets") + return error(404, error="Only available for top-level datasets.") results_path = dataset.check_dataset_finished() if not results_path: - return error(404, error="This dataset didn't finish executing") + return error(404, error="This dataset didn't finish executing.") if datasource not in config.get("explorer.config") and not config["explorer.config"][datasource]["enabled"]: - return error(404, error="Explorer functionality disabled for %s" % datasource) + return error(404, error="Explorer functionality disabled for %s." % datasource) # The amount of posts to show on a page posts_per_page = config.get("explorer.posts_per_page", 50) @@ -90,10 +90,9 @@ def explorer_dataset(key, page=1): sort = None # Check if we have to reverse the order. - reverse = True if request.args.get("desc") in ("true", True) else False - - # Check if we have to convert the sort value to an integer. 
- force_number = True if request.args.get("int") in ("true", True) else False + reverse = True if request.args.get("order") == "reverse" else False + print(request.args.get("order")) + print(reverse) # Load posts post_ids = [] @@ -161,9 +160,9 @@ def explorer_dataset(key, page=1): annotations = json.loads(annotations["annotations"]) # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, has_database=has_database, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, dataset=dataset, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) + return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, parameters=parameters, posts=posts, annotation_fields=annotation_fields, annotations=annotations, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts) -@app.route('/result///explorer') +@app.route('/results///explorer') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @@ -519,7 +518,7 @@ def get_image_file(img_file): return send_file(str(image_path)) -def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False, **kwargs): +def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): """ Loop through both csv and NDJSON files. This is basically a wrapper function for `iterate_items()` with the @@ -531,24 +530,22 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, force_number=False :param key, str: The dataset object. :param sort_by, str: The item key that determines the sort order. :param reverse, bool: Whether to sort by largest values first. - :param force_number, bool: Whether the sort value should be converted to a - floating point number. """ # Storing posts in the right order here - posts = [] - - # Generate reader on the basis of sort value - # At the moment, this is very inefficient, but - # suffices for the few cases where `sort` is used. - #sort_by_index = next(reader).index(sort) - #reader = sorted(reader, key=lambda x: convert_to_float(x[sort_by_index]) if len(x) >= sort_by_index else 0, reverse=True) - #sorted([json.loads(line) for line in infile], key=lambda x: convert_to_float(flatten_dict(x)[sort]), reverse=True) + sorted_posts = [] - for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort]): - posts.append(item) - - for post in posts: + try: + for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x[sort], reverse=reverse): + sorted_posts.append(item) + except TypeError: + # Dataset fields can contain integers and empty strings. + # Since these cannot be compared, we will convert every + # empty string to 0. 
+ for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: convert_to_float(x[sort]), reverse=reverse): + sorted_posts.append(item) + + for post in sorted_posts: yield post def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): From c33fd721a7b5aa50b06803043de1dd0666edf238 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Thu, 18 Apr 2024 19:55:30 +0200 Subject: [PATCH 018/204] Style and fix annotation field editor, enable config settings for CSS --- .../dmi-tcatv2/explorer/dmi-tcat-explorer.css | 84 --------- webtool/static/css/dataset-page.css | 22 +++ .../{explorer-default.css => default.css} | 0 .../static/css/explorer/dmi-tcat.css | 0 .../static/css/explorer/douyin.css | 0 .../static/css/explorer/fourchan.css | 0 .../static/css/explorer/instagram.css | 0 .../static/css/explorer/reddit.css | 0 .../{telegram-search.css => telegram.css} | 0 .../static/css/explorer/tiktok.css | 0 .../static/css/explorer/tiktok_urls.css | 0 .../static/css/explorer/tumblr.css | 0 .../static/css/explorer/twitter-import.css | 0 .../static/css/explorer/twitterv2.css | 0 webtool/static/css/stylesheet.css | 10 +- webtool/static/js/explorer.js | 165 +++++++++--------- .../components/result-result-row.html | 4 +- .../explorer/annotations-editor.html | 105 ++++++----- webtool/templates/explorer/controls.html | 118 +++++++------ webtool/templates/explorer/explorer.html | 21 ++- webtool/templates/explorer/pagination.html | 1 + webtool/templates/explorer/post.html | 14 +- webtool/views/views_explorer.py | 43 +---- 23 files changed, 264 insertions(+), 323 deletions(-) delete mode 100644 datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css rename webtool/static/css/explorer/{explorer-default.css => default.css} (100%) rename datasources/dmi-tcat/explorer/dmi-tcat-explorer.css => webtool/static/css/explorer/dmi-tcat.css (100%) rename datasources/douyin/explorer/douyin-explorer.css => webtool/static/css/explorer/douyin.css (100%) rename datasources/fourchan/explorer/fourchan-explorer.css => webtool/static/css/explorer/fourchan.css (100%) rename datasources/instagram/explorer/instagram-explorer.css => webtool/static/css/explorer/instagram.css (100%) rename datasources/reddit/explorer/reddit-explorer.css => webtool/static/css/explorer/reddit.css (100%) rename webtool/static/css/explorer/{telegram-search.css => telegram.css} (100%) rename datasources/tiktok/explorer/tiktok-explorer.css => webtool/static/css/explorer/tiktok.css (100%) rename datasources/tiktok_urls/explorer/tiktok_urls-explorer.css => webtool/static/css/explorer/tiktok_urls.css (100%) rename datasources/tumblr/explorer/tumblr-explorer.css => webtool/static/css/explorer/tumblr.css (100%) rename datasources/twitter-import/explorer/twitter-import-explorer.css => webtool/static/css/explorer/twitter-import.css (100%) rename datasources/twitterv2/explorer/twitterv2-explorer.css => webtool/static/css/explorer/twitterv2.css (100%) diff --git a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css b/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css deleted file mode 100644 index 86bf76e27..000000000 --- a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css +++ /dev/null @@ -1,84 +0,0 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
- - */ - -body { - background-color: white; -} - -.posts .post { - position: relative; - background-color: white; - max-width: 620px; - border: 1px solid #d6d6d6; - border-radius: 10px; - min-height: 50px; -} - -.posts .post header { - display: inline-block; - line-height: 1.7em; - margin-bottom: 5px; - border: none; - color: rgb(104, 119, 130); -} - -.posts .post header .post_id { - display: none; -} - -.posts .post header .author { - color: black; -} - -.posts .post header .profile_picture { - float: left; - margin-right: 15px; -} - -.posts .post header .profile_picture img { - border-radius: 100px; - width: 50px; -} - -.posts .post header .profile_picture:after { - display: none; -} - -.posts .post article { - margin: 0; - padding: 0; -} - -.posts .post.op { - background-color: white; - color: black; -} - -.posts .post .post-content { - display: inline-block; -} - -.posts .post .post-image { - margin-bottom: 10px; -} - -.posts .post .post-image img { - border-radius: 10px; -} - -.posts .external-url { - color: rgb(104, 119, 130); -} - -.posts .post.op .post-annotations, .posts .post .post-annotations { - border-radius: 10px; - background-color: rgb(241, 249, 255); - color: #474747; -} - -span.hashtag { - color: rgb(29, 155, 240); -} \ No newline at end of file diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index e82742bda..c9b04c605 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -694,4 +694,26 @@ body.image-preview { #image-zoom:checked + label img { max-height: none; cursor: zoom-out; +} + +/* Explorer view */ +#annotation-fields-editor-controls { + display: grid; + grid-template-columns: auto auto auto; +} + +#annotation-fields-editor-controls>div { + border-bottom: 1px solid var(--contrast-bright); +} + +#edit-annotation-fields #input-warning { + color: var(--accent-error); +} + +/* Remove all styles for explorer posts */ +/* these ought to be defined specifically */ +/* and 4CAT styles shouldn't interfere. 
*/ +#explorer-posts, #explorer-posts > ol li { + all: initial; + padding: 0; } \ No newline at end of file diff --git a/webtool/static/css/explorer/explorer-default.css b/webtool/static/css/explorer/default.css similarity index 100% rename from webtool/static/css/explorer/explorer-default.css rename to webtool/static/css/explorer/default.css diff --git a/datasources/dmi-tcat/explorer/dmi-tcat-explorer.css b/webtool/static/css/explorer/dmi-tcat.css similarity index 100% rename from datasources/dmi-tcat/explorer/dmi-tcat-explorer.css rename to webtool/static/css/explorer/dmi-tcat.css diff --git a/datasources/douyin/explorer/douyin-explorer.css b/webtool/static/css/explorer/douyin.css similarity index 100% rename from datasources/douyin/explorer/douyin-explorer.css rename to webtool/static/css/explorer/douyin.css diff --git a/datasources/fourchan/explorer/fourchan-explorer.css b/webtool/static/css/explorer/fourchan.css similarity index 100% rename from datasources/fourchan/explorer/fourchan-explorer.css rename to webtool/static/css/explorer/fourchan.css diff --git a/datasources/instagram/explorer/instagram-explorer.css b/webtool/static/css/explorer/instagram.css similarity index 100% rename from datasources/instagram/explorer/instagram-explorer.css rename to webtool/static/css/explorer/instagram.css diff --git a/datasources/reddit/explorer/reddit-explorer.css b/webtool/static/css/explorer/reddit.css similarity index 100% rename from datasources/reddit/explorer/reddit-explorer.css rename to webtool/static/css/explorer/reddit.css diff --git a/webtool/static/css/explorer/telegram-search.css b/webtool/static/css/explorer/telegram.css similarity index 100% rename from webtool/static/css/explorer/telegram-search.css rename to webtool/static/css/explorer/telegram.css diff --git a/datasources/tiktok/explorer/tiktok-explorer.css b/webtool/static/css/explorer/tiktok.css similarity index 100% rename from datasources/tiktok/explorer/tiktok-explorer.css rename to webtool/static/css/explorer/tiktok.css diff --git a/datasources/tiktok_urls/explorer/tiktok_urls-explorer.css b/webtool/static/css/explorer/tiktok_urls.css similarity index 100% rename from datasources/tiktok_urls/explorer/tiktok_urls-explorer.css rename to webtool/static/css/explorer/tiktok_urls.css diff --git a/datasources/tumblr/explorer/tumblr-explorer.css b/webtool/static/css/explorer/tumblr.css similarity index 100% rename from datasources/tumblr/explorer/tumblr-explorer.css rename to webtool/static/css/explorer/tumblr.css diff --git a/datasources/twitter-import/explorer/twitter-import-explorer.css b/webtool/static/css/explorer/twitter-import.css similarity index 100% rename from datasources/twitter-import/explorer/twitter-import-explorer.css rename to webtool/static/css/explorer/twitter-import.css diff --git a/datasources/twitterv2/explorer/twitterv2-explorer.css b/webtool/static/css/explorer/twitterv2.css similarity index 100% rename from datasources/twitterv2/explorer/twitterv2-explorer.css rename to webtool/static/css/explorer/twitterv2.css diff --git a/webtool/static/css/stylesheet.css b/webtool/static/css/stylesheet.css index 0fd5d7733..8c928e78e 100644 --- a/webtool/static/css/stylesheet.css +++ b/webtool/static/css/stylesheet.css @@ -956,6 +956,11 @@ article section.data-overview .description { color: var(--contrast-bright); } +.pagination .details { + margin: 0 auto; + text-align: center; +} + .tabs { border-bottom: 1px dotted var(--contrast-dark); max-height: 5em; @@ -1206,8 +1211,3 @@ ol.result-list li.has_results 
.property-container.analysis a { padding: 0; margin: 0.5em 0 0 0; } - -#explorer-posts, #explorer-posts > ol li { - all: initial; - padding: 0; -} \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 8c45e9b7e..ac160e269 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -22,57 +22,33 @@ const annotations = { init: function() { - // Show and hide the annotation controls - $("#toggle-annotation-controls").on("click", function() { - $("#annotation-controls-buttons").toggleClass("hidden"); - if ($("#annotation-controls-buttons").hasClass("hidden")) { - $(this).html(""); - } - else { - $(this).html(""); - } - }); + let edit_field_box = $("#edit-annotation-fields"); + let editor = $("#annotation-fields-editor"); + let editor_controls = $("#annotation-fields-editor-controls"); // Add a new annotation field when clicking the plus icon - $("#add-annotation-field").on("click", function(){$("#annotation-fields").append(annotations.getAnnotationsDiv);}); + $("#new-annotation-field").on("click", function(){ + let annotations_div = annotations.getAnnotationsDiv(); + $(annotations_div).insertBefore(edit_field_box);}); // Show and hide the annotations editor $("#toggle-annotation-fields").on("click", function(){ - $("#annotations-editor-container").toggle(); - if ($("#annotation-controls-buttons").hasClass("hidden")) { - $(this).html(""); + editor.toggleClass("hidden"); + if (editor.hasClass("hidden")) { + $("#toggle-annotation-fields").html(" Edit fields"); } else { - $(this).html(""); + $("#toggle-annotation-fields").html(" Hide editor"); } }); - $("#annotations-editor").click(function(e) { - e.stopPropagation(); - }); // Keep track of when the annotation fields were edited. - $("#annotation-fields").on("click", "#add-annotation-field, .delete-input, .delete-input i, .delete-option-field, .delete-option-field i", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); - }); - $("#annotation-fields").on("change keydown", "input, select", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); - }); - - // Close the annotation field editor (ask whether unsaved changes can be discarded) - $("#annotations-editor-container, #close-annotation-fields").click(function(e){ - e.preventDefault(); - if (!$("#save-annotation-fields").prop("disabled")) { - let conf = confirm("Close without applying input fields?"); - if (conf) { - $("#annotations-editor-container").hide(); - $("#annotation-fields").html(old_annotation_fields); - $("#save-annotation-fields").addClass("invalid").prop("disabled", true); - } - } - else { - $("#annotations-editor-container").hide(); - } + editor_controls.on("click", "#apply-annotation-fields, .delete-input, .delete-option-field", function() { + $("#apply-annotation-fields").removeClass("disabled"); }); + editor_controls.on("change keydown", "input, select", function() { + $("#apply-annotation-fields").removeClass("disabled"); + }); // Show and hide annotations $("#toggle-annotations").on("click", function(){ @@ -82,26 +58,32 @@ const annotations = { }); // Delete an entire annotation input - $("#annotation-fields").on("click", ".annotation-field > .delete-input", function(e){$(this).parent().remove();}); + // We're in a grid of threes, so this involves three divs + editor_controls.on("click", ".annotation-field > .delete-input", function(e){ + let parent_div = $(this).parent().parent(); + parent_div.next().remove(); // Input type + 
parent_div.next().remove(); // Options + parent_div.remove(); // Label + }); // Make saving available when annotation fields are changed - $("#annotation-fields").on("click", ".annotation-field > .option-fields > .option-field > .delete-option-field", function() { + editor_controls.on("click", ".delete-option-field", function() { annotations.deleteOption(this); }); - $("#annotation-fields").on("change", ".annotation-field > .annotation-field-type", function(e) {annotations.toggleField(e.target);}); + editor_controls.on("change", ".annotation-field-type", function(e) {annotations.toggleField(e.target);}); - // Make enter add a new option field - $("#annotation-fields").on("keypress", "input", function(e){ + // Make enter apply the option fields + editor_controls.on("keypress", "input", function(e){ if (e.which == 13) { annotations.applyAnnotationFields(); } }); - // Save the annotations fields to the database - $("#save-annotation-fields").on("click", annotations.applyAnnotationFields); + // Save the annotation fields to the database + $("#apply-annotation-fields").on("click", annotations.applyAnnotationFields); // Dynamically add a new option field when another is edited - $("#annotation-fields").on("keyup", ".annotation-field > .option-fields > .option-field > input", function(e) { + editor_controls.on("keyup", ".option-field > input", function(e) { if ($(this).val().length > 0) { annotations.addOptions(e.target); } @@ -131,7 +113,7 @@ const annotations = { // Ask whether the next page should be opened without saving annotations $('a > .page').click(function(){ if (!$("#save-annotations").prop('disabled')) { - return confirm("You'll lose unsaved annotations for this page if you don't save first.\nDo you still want to continue?"); + return confirm("Unsaved annotations are lost if you don't save before leaving the page.\nLeave anyway?"); } }) @@ -148,18 +130,20 @@ const annotations = { }, toggleField: function (el) { - // Change the type of input fields when switching in the dropdown + let type = $(el).val(); let old_type = $(el).attr("data-val"); + let options = $(el).parent().parent().next(); + let option_fields = options.find(".option-field"); + if (type == "text" || type == "textarea") { - $(el).parent().find(".option-fields").remove(); + option_fields.remove(); } else if (type == "dropdown" || type == "checkbox") { - if (!($(el).siblings(".option-fields").length) > 0) { - $(el).after("
      "); - $(el).next().append(annotations.getInputField); + if (option_fields.length == 0) { + options.append(annotations.getInputField); } } }, @@ -171,7 +155,7 @@ const annotations = { // no empty fields available, add a new one. let no_empty_fields = true; let input_fields = $(el).parent().siblings(); - + console.log(input_fields) if (!$(el).val().length > 0) { no_empty_fields = false; } @@ -183,6 +167,7 @@ const annotations = { no_empty_fields = false; } }); + // Add a new field if there's no empty ones if (no_empty_fields) { $(el).parent().after(annotations.getInputField); } @@ -204,7 +189,7 @@ const annotations = { return false; } $(this).append(` - `); + `); }); } }, @@ -223,7 +208,7 @@ const annotations = { // Validates and converts the fields in the annotations editor. // Returns an object with the set annotation fields. - annotation_fields = {}; + var annotation_fields = {}; var warning = ""; var labels_added = [] @@ -233,9 +218,14 @@ const annotations = { // Parse information from the annotations editor. $(".annotation-field").each(function(){ - + // To align the input form, we're in a grid of threes: + // label, input type, options. + // Navigate the DOM to get these elements: let label_field = $(this).children(".annotation-field-label"); - let label = label_field.val().replace(/\s+/g, ' ');; + let type_field = $(this).parent().next(); + let options_field = $(this).parent().next().next(); + + let label = label_field.val().replace(/\s+/g, ' '); // Get the random identifier of the field, so we // can later check if it already exists. @@ -253,7 +243,7 @@ const annotations = { } // Set the types and values of the annotation - type = $(this).children(".annotation-field-type").val(); + type = type_field.find(".annotation-field-type").val(); // Keep track of the labels we've added labels_added.push(label) @@ -268,8 +258,8 @@ const annotations = { let no_options_added = true; let option_id = "" - $(this).find(".option-field > input").each(function(){ - + options_field.find(".option-field > input").each(function(){ + console.log(this) let option_label = $(this).val(); let option_id = this.id.replace("input-", ""); @@ -306,10 +296,10 @@ const annotations = { } }); + console.log(annotation_fields) if (warning.length > 0) { return warning; } - console.log(annotation_fields) return annotation_fields; }, @@ -320,9 +310,8 @@ const annotations = { var annotation_fields = annotations.parseAnnotationFields(e); var fields_to_add = {}; - // Show an error message if the annotation fields were not valid. - if (typeof annotation_fields == 'string') { + if (typeof annotation_fields == "string") { annotations.warnEditor(annotation_fields); return } @@ -331,11 +320,11 @@ const annotations = { // the annotation fields to each post on the page. else { - $("#save-annotation-fields").html(" Applying") + $("#apply-annotation-fields").html(" Applying") // Remove warnings annotations.warnEditor("") - $("#annotation-fields").find("input").each(function(){ + $("#annotation-field").find("input").each(function(){ $(this).removeClass("invalid"); }); $(".option-fields").find("input").each(function(){ @@ -579,7 +568,7 @@ const annotations = { } } - $("#save-annotation-fields").html(" Apply") + $("#apply-annotation-fields").html(" Apply") }, saveAnnotationFields: function (annotation_fields){ @@ -607,7 +596,7 @@ const annotations = { // If the query is accepted by the server. 
if (response == 'success') { $("#annotations-editor-container").hide(); - $("#save-annotation-fields").addClass("disabled"); + $("#apply-annotation-fields").addClass("disabled"); } // If the query is rejected by the server. @@ -703,7 +692,7 @@ const annotations = { annotations.enableSaving(); $("#save-annotations").html(" Annotations saved"); $("#save-annotations").addClass("disabled"); - old_annotation_fields = $("#annotation-fields").html(); + old_annotation_fields = $("#annotation-field").each(); // alert(alert_message); } else { @@ -775,7 +764,7 @@ const annotations = { warnEditor: function(warning) { - let warn_field = $("#annotations-input-warning"); + let warn_field = $("#input-warning"); warn_field.html(warning); if (warn_field.hasClass("hidden")) { warn_field.removeClass("hidden"); @@ -787,13 +776,13 @@ const annotations = { let ta = $("#toggle-annotations"); if (ta.hasClass("shown")) { ta.removeClass("shown"); - ta.html(" Hide annotations"); - $(".post-annotations").show(200); + ta.html(" Show annotations"); + $(".post-annotations").addClass("hidden"); } else { ta.addClass("shown"); - ta.html(" Show annotations"); - $(".post-annotations").hide(200); + ta.html(" Hide annotations"); + $(".post-annotations").removeClass("hidden"); } }, @@ -802,17 +791,25 @@ const annotations = { if (id == undefined || id == 0) { id = annotations.randomInt(); } + // Returns an annotation div element with a pseudo-random ID - return `
      - - - -
      `.replace("{{FIELD_ID}}", id); + return `
      +
      + + +
      +
      +
      +
      + +
      +
      +
      `.replace("{{FIELD_ID}}", id); }, getInputField: function(id){ diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 7f71f0ef4..2d7972398 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -44,14 +44,14 @@ {% endif %} -
 {% if __user_config("privileges.can_use_explorer") and has_explorer %} +</a> Explorer</a> - {% endif %}</li> + {% endif %} {% endif %}</li 
    \ No newline at end of file diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index 71c7dc5f3..f16c7c325 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -1,55 +1,80 @@ -
    - + + + +
    +
    +
    + Label + + +
    +
    +
    +
    + Input type + + +
    +
    +
    +
    + Options + + +
    +
    -
    -
    -
    Label
    -
    Input type
    -
    Options
    -
    + {% if annotation_fields %} -
    - {% if annotation_fields %} - - {% for field in annotation_fields %} + {% for field in annotation_fields %} + {% set annotation_type = annotation_fields[field]["type"] %} + {% set label = annotation_fields[field]["label"] %} - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} -
    +
    +
    - + +
    +
    +
    +
    +
    +
    - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    +
    {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    {% endfor %} -
    - - -
    +
    +
    - {% endif %} -
    - {% endfor %} - {% endif %} -
    +
    -
    - - - -

    Note: Changing input types will overwrite existing annotations for the field

    + {% else %} +
    + {% endif %} + {% endfor %} + {% endif %} +
    +
    + New field + Apply +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html index 3dc77990d..bf1921896 100644 --- a/webtool/templates/explorer/controls.html +++ b/webtool/templates/explorer/controls.html @@ -1,61 +1,67 @@ -
    -

    - {{ dataset.get_label() }} -

    - - - {% if custom_fields and custom_fields[0] == "invalid" %} -
    - Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). -
    - {% endif %} - {% if key and post_count > max_posts %} -
    Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.
    - {% set post_count = max_posts %} - {% endif %} - -
    - {% if not key and has_database %} - Showing {{ post_count }} posts from {{ datasource }}. - {% else %} - Showing posts {{ offset + 1 }} - {{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} in total). +
    +
    +

    + {{ dataset.get_label() }} - Explorer +

    + + {% if custom_fields and custom_fields[0] == "invalid" %} +
    + Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}). +
    {% endif %} -
    - -
    - +
    +
    Now showing
    +
    + {% if not key and has_database %} + {{ post_count }} posts from {{ datasource }}. + {% else %} + Posts {{ offset + 1 }}—{{ post_count if (offset + posts_per_page) > post_count else (offset + posts_per_page) }} ({{ post_count }} total). + {% endif %} +
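    +                {# A hedged illustration, not part of the patch: assuming the view sets
    +                   offset = (page - 1) * posts_per_page, then with posts_per_page = 50 and
    +                   post_count = 312, page 2 reads "Posts 51—100 (312 total)", while the
    +                   conditional above caps the upper bound at post_count on the last page. #}
    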
    +
    +
    -
    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 05fa4c0a5..d2d6e3b9a 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -6,8 +6,6 @@ {% block body %} - - @@ -16,12 +14,6 @@ var annotation_fields = {% if annotation_fields %}{{ annotation_fields | safe }}{% else %}""{% endif %} - -{% if custom_css %} - -{% endif %} {% set key = dataset.data.key %} @@ -29,6 +21,19 @@ {% include "explorer/pagination.html" %} + +{% if "css" in datasource_config %} + {% if datasource_config.css == "preset" %} + + {% elif datasource_config.css == "custom" %} + + {% else %} + + {% endif %} +{% endif %} + + +
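    +{# Editor's sketch of the three branches above (assumed semantics): "preset" loads a
    +   stylesheet shipped with the datasource, "custom" loads admin-supplied CSS, and any
    +   other value falls back to the generic Explorer stylesheet. #}
    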
      {% for post in posts %} diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html index ebf522890..2161f22bd 100644 --- a/webtool/templates/explorer/pagination.html +++ b/webtool/templates/explorer/pagination.html @@ -1,4 +1,5 @@
    From 4a92b1488c324674d922b8836bdc968532399307 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 16:03:21 +0200 Subject: [PATCH 071/204] Remove unused UserInput --- common/lib/user_input.py | 1 - 1 file changed, 1 deletion(-) diff --git a/common/lib/user_input.py b/common/lib/user_input.py index 4de5478c2..dc7a9f547 100644 --- a/common/lib/user_input.py +++ b/common/lib/user_input.py @@ -36,7 +36,6 @@ class UserInput: OPTION_HUE = "hue" # colour hue OPTION_DATASOURCES = "datasources" # data source toggling OPTION_DATASOURCES_TABLE = "datasources_table" # a table with settings per data source - OPTION_DATASOURCES_TEXT = "datasources_text" # text input per data source (via dropdown) OPTIONS_COSMETIC = (OPTION_INFO, OPTION_DIVIDER) From c4b19434af52ecc2a57f7c8d609aa080ae5cdc15 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 16:04:56 +0200 Subject: [PATCH 072/204] Remove unnecessary UserInput imports --- datasources/instagram/search_instagram.py | 1 - datasources/linkedin/search_linkedin.py | 1 - datasources/parler/search_parler.py | 1 - datasources/tiktok/search_tiktok.py | 1 - datasources/twitter-import/search_twitter.py | 1 - 5 files changed, 5 deletions(-) diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 642593220..4c096acd8 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -10,7 +10,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem, MissingMappedField from common.lib.exceptions import WorkerInterruptedException, MapItemException -from common.lib.helpers import UserInput class SearchInstagram(Search): diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index 53d61a707..cddd27663 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -11,7 +11,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchLinkedIn(Search): """ diff --git a/datasources/parler/search_parler.py b/datasources/parler/search_parler.py index 07b6116ce..8ccc7ccd8 100644 --- a/datasources/parler/search_parler.py +++ b/datasources/parler/search_parler.py @@ -10,7 +10,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchParler(Search): diff --git a/datasources/tiktok/search_tiktok.py b/datasources/tiktok/search_tiktok.py index b3214bc42..90f443b49 100644 --- a/datasources/tiktok/search_tiktok.py +++ b/datasources/tiktok/search_tiktok.py @@ -9,7 +9,6 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchTikTok(Search): diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index 51df5ef51..274045fb3 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -9,7 +9,6 @@ from backend.lib.search import Search from common.lib.helpers import strip_tags from common.lib.item_mapping import MappedItem -from common.lib.helpers import UserInput class SearchTwitterViaZeeschuimer(Search): From fcb747301809ed96f0c97491b4e2dd64fe37f295 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 4 Jul 2024 17:56:07 +0200 Subject: [PATCH 073/204] Use dictionary order as sort order for config settings --- 
    common/lib/config_definition.py | 25 ++++++++++++++----------- webtool/views/views_admin.py | 15 +++++++++++++-- webtool/views/views_explorer.py | 4 ++-- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py index 09d82f7ef..e1486a28b 100644 --- a/common/lib/config_definition.py +++ b/common/lib/config_definition.py @@ -4,18 +4,27 @@ Possible options and their default values. Options are actually set in 4CAT's Database. Additional options can be defined in Data sources or Processors as `config` objects. + +The order of the dictionary below determines the order of the settings in the interface. + """ from common.lib.user_input import UserInput import json config_definition = { - "datasources._intro": { + "datasources.intro": { "type": UserInput.OPTION_INFO, "help": "Data sources enabled below will be offered to people on the 'Create Dataset' page. Additionally, " "people can upload datasets for these by for example exporting them with " "[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer) to this 4CAT instance.\n\n" "Some data sources offer further settings which may be configured on other tabs." }, + "datasources.intro2": { + "type": UserInput.OPTION_INFO, + "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings " + "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the " + "expiration status of existing datasets." + }, "datasources.enabled": { "type": UserInput.OPTION_DATASOURCES, "default": ["ninegag", "douban", "douyin", "imgur", "upload", "instagram", "linkedin", "parler", "help": "Data Sources", "tooltip": "A list of enabled data sources that people can choose from when creating a dataset page." }, - "datasources._intro2": { - "type": UserInput.OPTION_INFO, - "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings " - "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the " - "expiration status of existing datasets."
    
- }, "datasources.expiration": { "type": UserInput.OPTION_TEXT_JSON, "default": {"fourchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightkun": {"enabled": False, "allow_optout": False, "timeout": 0}, "ninegag": {"enabled": True, "allow_optout": False, "timeout": 0}, "bitchute": {"enabled": True, "allow_optout": False, "timeout": 0}, "dmi-tcat": {"enabled": False, "allow_optout": False, "timeout": 0}, "dmi-tcatv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "douban": {"enabled": True, "allow_optout": False, "timeout": 0}, "douyin": {"enabled": True, "allow_optout": False, "timeout": 0}, "gab": {"enabled": True, "allow_optout": False, "timeout": 0}, "imgur": {"enabled": True, "allow_optout": False, "timeout": 0}, "upload": {"enabled": True, "allow_optout": False, "timeout": 0}, "instagram": {"enabled": True, "allow_optout": False, "timeout": 0}, "linkedin": {"enabled": True, "allow_optout": False, "timeout": 0}, "parler": {"enabled": True, "allow_optout": False, "timeout": 0}, "reddit": {"enabled": False, "allow_optout": False, "timeout": 0}, "telegram": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok-urls": {"enabled": False, "allow_optout": False, "timeout": 0}, "truthsocial": {"enabled": True, "allow_optout": False, "timeout": 0}, "tumblr": {"enabled": False, "allow_optout": False, "timeout": 0}, "twitter": {"enabled": True, "allow_optout": False, "timeout": 0}, "twitterv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "usenet": {"enabled": False, "allow_optout": False, "timeout": 0}, "vk": {"enabled": False, "allow_optout": False, "timeout": 0}}, @@ -305,13 +308,13 @@ "global": True }, # Explorer settings - "explorer.__basic-explanation": { + "explorer.basic-explanation": { "type": UserInput.OPTION_INFO, "help": "4CAT's Explorer feature lets you navigate and annotate datasets as if they " "appared on their original platform. This is intended to facilitate qualitative " "exploration and manual coding." }, - "explorer.__max_posts": { + "explorer.max_posts": { "type": UserInput.OPTION_TEXT, "default": 100000, "help": "Amount of posts", @@ -319,14 +322,14 @@ "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, - "explorer.__posts_per_page": { + "explorer.posts_per_page": { "type": UserInput.OPTION_TEXT, "default": 50, "help": "Posts per page", "coerce_type": int, "tooltip": "Number of posts to display per page" }, - "explorer._config_explanation": { + "explorer.config_explanation": { "type": UserInput.OPTION_INFO, "help": "Per data source, you can enable or disable the Explorer. 
Posts will be formatted through a generic template " "made of [this HTML file](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/templates/explorer/" diff --git a/webtool/views/views_admin.py b/webtool/views/views_admin.py index d982f1042..fcd0c2e98 100644 --- a/webtool/views/views_admin.py +++ b/webtool/views/views_admin.py @@ -571,10 +571,12 @@ def manipulate_settings(): flash("Invalid settings: %s" % str(e)) all_settings = config.get_all(user=None, tags=[tag]) + options = {} changed_categories = set() - for option in sorted({*all_settings.keys(), *definition.keys()}): + + for option in {*all_settings.keys(), *definition.keys()}: tag_value = all_settings.get(option, definition.get(option, {}).get("default")) global_value = global_settings.get(option, definition.get(option, {}).get("default")) is_changed = tag and global_value != tag_value @@ -616,7 +618,16 @@ def manipulate_settings(): changed_categories.add(option.split(".")[0]) tab = "" if not request.form.get("current-tab") else request.form.get("current-tab") - options = {k: options[k] for k in sorted(options, key=lambda o: options[o]["tabname"])} + + # We are ordering the options based on how they are ordered in their dictionaries, + # and not the database order. To do so, we're adding a simple config order number + # and sort on this. + config_order = 0 + for k, v in definition.items(): + options[k]["config_order"] = config_order + config_order += 1 + + options = {k: options[k] for k in sorted(options, key=lambda o: (options[o]["tabname"], options[o].get("config_order", 0)))} # 'data sources' is one setting but we want to be able to indicate # overrides per sub-item diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 9ed869c0d..61fb467e6 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -66,10 +66,10 @@ def explorer_dataset(key, page=1): return error(404, error="Explorer functionality disabled for %s." % datasource) # The amount of posts to show on a page - posts_per_page = config.get("explorer.__posts_per_page", 50) + posts_per_page = config.get("explorer.posts_per_page", 50) # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.__max_posts', 500000) + max_posts = config.get('explorer.max_posts', 500000) # The offset for posts depending on the current page offset = ((page - 1) * posts_per_page) if page else 0 From e672933ad53f5cc0f34bca937bb640e4115118f9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 12:36:38 +0200 Subject: [PATCH 074/204] Change name of "Explore" button to "Explore & annotate" --- webtool/templates/components/result-result-row.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 253e83c34..bec6e0d49 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -47,7 +47,7 @@ {% if __user_config("privileges.can_use_explorer") and has_explorer %}
  • - Explorer + Explore & annotate
  • From 0d2eef2d034969151efd8739d8b0413dc31b64ea Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 12:52:38 +0200 Subject: [PATCH 075/204] Space out Twitter metrics better --- webtool/static/css/explorer/twitter.css | 18 ++++-------------- .../explorer/datasource-templates/twitter.html | 8 ++++---- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/webtool/static/css/explorer/twitter.css b/webtool/static/css/explorer/twitter.css index af9fa1c1a..debdceb02 100644 --- a/webtool/static/css/explorer/twitter.css +++ b/webtool/static/css/explorer/twitter.css @@ -70,23 +70,13 @@ margin-top: 20px; } -.time, .metrics, .atname { +.time, .metrics, .atname, .external-url a { color: #7a8a97; } -.posts .post .metrics span { - margin-right: 60px; -} - -.verified { - color: rgb(29, 155, 240) -} - -.posts .external-url { - position: absolute; - bottom: 10px; - right: 10px; - color: rgb(104, 119, 130); +.posts .post .metrics { + display: flex; + justify-content: space-between; } span.hashtag { diff --git a/webtool/templates/explorer/datasource-templates/twitter.html b/webtool/templates/explorer/datasource-templates/twitter.html index 12fbf9c5c..fa238674a 100644 --- a/webtool/templates/explorer/datasource-templates/twitter.html +++ b/webtool/templates/explorer/datasource-templates/twitter.html @@ -72,11 +72,11 @@ {{ post.get("retweet_count") }} {% if post.get("impression_count") %} {{ post.get("impression_count") | numberify }}{% endif %} + + {% if not pseudonymised %} + + {% endif %}
    - - {% if not pseudonymised %} - - {% endif %}
    \ No newline at end of file From 7389e9b19d2ca1edf06e5854bc9c5069699d27e5 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 13:29:57 +0200 Subject: [PATCH 076/204] Include index in Explorer posts loop --- webtool/templates/explorer/explorer.html | 1 + 1 file changed, 1 insertion(+) diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 7e9ae2e14..92fb27298 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -53,6 +53,7 @@
      {% for post in posts %} + {% set post_count = loop.index %} {% include "explorer/post.html" %} {% endfor %}
    From b8e1267c3d19232ff013765c7f12daded3282660 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 9 Jul 2024 17:58:18 +0200 Subject: [PATCH 077/204] Telegram Explorer template v0.5 --- webtool/static/css/explorer/telegram.css | 73 ++++++++++++++++++ .../components/result-result-row.html | 2 +- .../datasource-templates/telegram.html | 74 +++++++++++++++++++ webtool/templates/explorer/explorer.html | 2 +- 4 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 webtool/static/css/explorer/telegram.css create mode 100644 webtool/templates/explorer/datasource-templates/telegram.html diff --git a/webtool/static/css/explorer/telegram.css b/webtool/static/css/explorer/telegram.css new file mode 100644 index 000000000..5b3a6c0a1 --- /dev/null +++ b/webtool/static/css/explorer/telegram.css @@ -0,0 +1,73 @@ +@font-face { + font-family: 'Open Sans'; + src: url("../fonts/OpenSans-Regular.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + src: url("../fonts/OpenSans-Bold.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-style: italic; + src: url("../fonts/OpenSans-Italic.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + font-style: italic; + src: url("../fonts/OpenSans-BoldItalic.ttf") +} + +* { + font-family: "Open Sans", Arial; + font-size: 16px; + line-height: 1.5; +} + +.explorer-content ol li { + background-color: #6fa788; + padding: 1px; +} + +.posts .post-content { + list-style-type: none; + background-color: white; + width: 450px; + margin: 0 auto; + margin-top: 2px; + margin-bottom: 2px; + border-radius: 5px 15px 15px 5px; + padding: 12px 17px; +} + +.post-content.new-group { + border-radius: 0 10px 10px 10px; + margin-top: 10px; +} + +.author { + color: #2984cd; + font-weight: bold; +} + +.media-container img { + width: 100%; +} + +.day { + margin: 8px; + text-align: center; + color: white; +} + +.day span { + padding: 5px; + padding-left: 10px; + padding-right: 10px; + background-color: rgba(0,0,0,.3); + border-radius: 20px; +} \ No newline at end of file diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index bec6e0d49..06bd59290 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -49,7 +49,7 @@ Explore & annotate - + {% endif %} diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html new file mode 100644 index 000000000..417a5ddbc --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -0,0 +1,74 @@ + + + {% set day = post.unix_timestamp | datetime(fmt="%d %B", wrap=False) %} + {% set prev_post = posts[post_index - 1] if post_index > 0 else {} %} + {% set new_day = day if not prev_post or prev_post.get("unix_timestamp", 0) | datetime(fmt="%d %B", wrap=False) != day else False %} + {% set new_author = True if not prev_post or prev_post.author != post.author else False %} + + {% if new_day %} +
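    +        {# A sketch of the grouping flags set above (assumption: posts arrive in
    +           chronological order): for consecutive posts by authors a, a, b, new_author
    +           is True, False, True, so only the first post of each run shows the avatar
    +           and name; new_day holds the "%d %B" label whenever the day changes, and
    +           this block then renders a Telegram-style date chip. #}
    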
    + {{ new_day }} +
    + {% endif %} + +
    +
    + {% if new_author or new_day %} + {% set author = post.author_username if not post.author_name else post.author_name %} + {% if post.author_name %} +
    + {% if not pseudonymised %} + + {% for name in author_name.split() %} + {{ name[0] }} + {% endfor %} + {% else %} + + {% endif %} +
    + +
    + + + + + + + + +
    + {% endif %} +
    +
    + {% if not pseudonymised %} + {{ author }} + {% else %} + + {% endif %} +
    + {% else %} +
    + {% endif %} + {% if post.attachment_type %} +
    + + + +
    + {% endif %} +
    + {{ post.body }} +
    + + +
    + {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC +
    + + {% if not pseudonymised %} + + {% endif %} +
    +
    +
    \ No newline at end of file diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 92fb27298..137864b5e 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -53,7 +53,7 @@
      {% for post in posts %} - {% set post_count = loop.index %} + {% set post_index = loop.index - 1 %} {% include "explorer/post.html" %} {% endfor %}
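    {# Editor's note, a sketch rather than part of the patch: Jinja's loop.index is
       1-based, so post_index = loop.index - 1 is the 0-based position (loop.index0
       would be equivalent), letting datasource templates look back via
       posts[post_index - 1] to decide on day separators and author grouping. #}
    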
    From f3f6f41509da5165767577c1a4f44beeb32ed064 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 12:14:40 +0200 Subject: [PATCH 078/204] Add a string character counter template that also handles graphemes --- webtool/lib/template_filters.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 108241d93..91b72a3f6 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -6,6 +6,7 @@ import os import re import requests +import regex from urllib.parse import urlencode, urlparse from webtool import app, config @@ -236,6 +237,25 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body +@app.template_filter('string_counter') +def _jinja2_filter_string_counter(string, is_emoji=False): + # Returns a dictionary with counts of characters in a string. + # Also handles emojis. + + # We need to convert multi-character emojis ("graphemes") to one character. + if is_emoji == True: + string = regex.finditer(r"\X", string) # \X matches graphemes + string = [m.group(0) for m in string] + + # Count 'em + counter = {} + for s in string: + if s not in counter: + counter[s] = 0 + counter[s] += 1 + + return counter + @app.template_filter('parameter_str') def _jinja2_filter_parameter_str(url): # Returns the current URL parameters as a valid string. From e904351e6fee40ad6c7eb45017ca5862d5e02872 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 14:15:06 +0200 Subject: [PATCH 079/204] Fix incorrect emoji handling with resolved references in Telegram --- datasources/telegram/search_telegram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 477cd9999..29e1b5195 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -641,7 +641,10 @@ def map_item(message): if message.get("reactions") and message["reactions"].get("results"): for reaction in message["reactions"]["results"]: - reactions += reaction["reaction"] * reaction["count"] + reaction_type = reaction["reaction"] + if isinstance(reaction_type, dict): + reaction_type = reaction_type["emoticon"] + reactions += reaction_type * reaction["count"] return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", From 9fcd9aaaa21fe696672727c60bd2984e267a735e Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:18:49 +0200 Subject: [PATCH 080/204] Get markdown text from telegram messages --- datasources/telegram/search_telegram.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 29e1b5195..89c5e321a 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -327,8 +327,10 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): i = 0 try: entity_posts = 0 + async for message in client.iter_messages(entity=query, offset_date=max_date): entity_posts += 1 + i += 1 if self.interrupted: raise ProcessorInterruptedException( @@ -346,6 +348,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) 
serialized_message = SearchTelegram.serialize_obj(message) + if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -646,6 +649,12 @@ def map_item(message): reaction_type = reaction_type["emoticon"] reactions += reaction_type * reaction["count"] + is_reply = False + reply_to = "" + if message.get("reply_to"): + is_reply = True + reply_to = message["reply_to"].get("reply_to_msg_id", "") + return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", "thread_id": thread, @@ -655,7 +664,8 @@ def map_item(message): "author_name": fullname, "author_is_bot": "yes" if user_is_bot else "no", "body": message["message"], - "reply_to": message.get("reply_to_msg_id", ""), + "is_reply": is_reply, + "reply_to": reply_to, "views": message["views"] if message["views"] else "", "forwards": message.get("forwards", MissingMappedField(0)), "reactions": reactions, @@ -728,6 +738,7 @@ def serialize_obj(input_obj): obj = input_obj.copy() mapped_obj = {} + for item, value in obj.items(): if type(value) is datetime: mapped_obj[item] = value.timestamp() @@ -746,6 +757,11 @@ def serialize_obj(input_obj): # Add the _type if the original object was a telethon type if type(input_obj).__module__ in ("telethon.tl.types", "telethon.tl.custom.forward"): mapped_obj["_type"] = type(input_obj).__name__ + + # Store the markdown-formatted text + if hasattr(input_obj, "text"): + mapped_obj["message"] = input_obj.text + return mapped_obj @staticmethod From c0c7bfa949fa474085c0fd50b2775f0b1d110b64 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:33:32 +0200 Subject: [PATCH 081/204] ..but then a bit more elegant and also for resolved messages --- datasources/telegram/search_telegram.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 89c5e321a..4c3ffcfc7 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -348,7 +348,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) serialized_message = SearchTelegram.serialize_obj(message) - + if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -759,7 +759,7 @@ def serialize_obj(input_obj): mapped_obj["_type"] = type(input_obj).__name__ # Store the markdown-formatted text - if hasattr(input_obj, "text"): + if type(input_obj).__name__ == "Message": mapped_obj["message"] = input_obj.text return mapped_obj From 92ac7c4c01499dee177702275a2ea9a9dc88961f Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:33:58 +0200 Subject: [PATCH 082/204] styling --- webtool/templates/explorer/datasource-templates/tiktok.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/datasource-templates/tiktok.html b/webtool/templates/explorer/datasource-templates/tiktok.html index 882d87ac7..e89ef13bc 100644 --- a/webtool/templates/explorer/datasource-templates/tiktok.html +++ b/webtool/templates/explorer/datasource-templates/tiktok.html @@ -28,7 +28,7 @@ - {{ post.body | social_mediafy(datasource='tiktok') | safe }} + {{ post.body | social_mediafy(datasource="tiktok") | safe }}
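    {# For illustration, an assumption rather than part of the patch: social_mediafy
       wraps #hashtags and @mentions in anchor tags built from the per-platform base
       URLs, so a TikTok body like "dance video #fyp @creator" gains links composed of
       the configured hashtag base plus "fyp" and the mention base plus "creator". #}
    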
    From d4256aeca852467239e22a40b0a8ba7a3b0c3a2a Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:34:12 +0200 Subject: [PATCH 083/204] Telegram template v1.0 --- webtool/lib/template_filters.py | 42 ++-- webtool/static/css/explorer/telegram.css | 214 +++++++++++++++--- .../datasource-templates/telegram.html | 91 +++++--- 3 files changed, 263 insertions(+), 84 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 91b72a3f6..d5d12385d 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -186,11 +186,6 @@ def _jinja2_filter_social_mediafy(body, datasource=""): if not datasource: return body - # Supported data sources - known_datasources = ["twitter", "tiktok", "instagram", "tumblr", "linkedin"] - if datasource not in known_datasources: - return body - # Base URLs after which tags and @-mentions follow, per platform base_urls = { "twitter": { @@ -212,38 +207,47 @@ def _jinja2_filter_social_mediafy(body, datasource=""): "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", "mention": "https://linkedin.com/in/" + }, + "telegram": { } } + # Supported data sources + known_datasources = list(base_urls.keys()) + if datasource not in known_datasources: + return body + # Add URL links for url in urls_from_text(body): body = re.sub(url, "%s" % (url, url), body) # Add hashtag links - tags = re.findall(r"#[\w0-9]+", body) - # We're sorting tags by length so we don't incorrectly - # replace tags that are a substring of another, longer tag. - tags = sorted(tags, key=lambda x: len(x), reverse=True) - for tag in tags: - # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. - # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). - body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) + if "hashtag" in base_urls[datasource]: + tags = re.findall(r"#[\w0-9]+", body) + # We're sorting tags by length so we don't incorrectly + # replace tags that are a substring of another, longer tag. + tags = sorted(tags, key=lambda x: len(x), reverse=True) + for tag in tags: + # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. + # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). + body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) # Add @-mention links - mentions = re.findall(r"@[\w0-9]+", body) - mentions = sorted(mentions, key=lambda x: len(x), reverse=True) - for mention in mentions: - body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) + if "mention" in base_urls[datasource]: + mentions = re.findall(r"@[\w0-9]+", body) + mentions = sorted(mentions, key=lambda x: len(x), reverse=True) + for mention in mentions: + body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) return body @app.template_filter('string_counter') -def _jinja2_filter_string_counter(string, is_emoji=False): +def _jinja2_filter_string_counter(string, emoji=False): # Returns a dictionary with counts of characters in a string. # Also handles emojis. # We need to convert multi-character emojis ("graphemes") to one character.
    
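    # Editor's illustration (hedged, not part of the patch): the third-party `regex`
    # module's r"\X" pattern matches one extended grapheme cluster, i.e. one
    # user-perceived character, so multi-codepoint emoji stay intact:
    #
    #   import regex
    #   [m.group(0) for m in regex.finditer(r"\X", "ab👍")]  # -> ['a', 'b', '👍']
    #
    # A ZWJ sequence such as a family emoji would likewise count once, whereas plain
    # list(string) splits it into its component codepoints. The filter then tallies
    # these clusters, e.g. "👍👍❤" -> {"👍": 2, "❤": 1}.
    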
- if is_emoji == True: + if emoji == True: string = regex.finditer(r"\X", string) # \X matches graphemes string = [m.group(0) for m in string] diff --git a/webtool/static/css/explorer/telegram.css b/webtool/static/css/explorer/telegram.css index 5b3a6c0a1..b4789b9e2 100644 --- a/webtool/static/css/explorer/telegram.css +++ b/webtool/static/css/explorer/telegram.css @@ -1,65 +1,186 @@ @font-face { - font-family: 'Open Sans'; + font-family: "Open Sans"; src: url("../fonts/OpenSans-Regular.ttf") } @font-face { - font-family: 'Open Sans'; + font-family: "Open Sans"; font-weight: bold; src: url("../fonts/OpenSans-Bold.ttf") } -@font-face { - font-family: 'Open Sans'; - font-style: italic; - src: url("../fonts/OpenSans-Italic.ttf") +* { + font-size: 15px; + line-height: 1.4; } -@font-face { - font-family: 'Open Sans'; - font-weight: bold; - font-style: italic; - src: url("../fonts/OpenSans-BoldItalic.ttf") +.explorer-content { + background-image: linear-gradient(#6ca587, #c4d18b); + padding-top: 5px; + padding-bottom: 20px; } -* { +.explorer-content ol li { + padding: 1px; + background: none; +} + +.posts .post { font-family: "Open Sans", Arial; - font-size: 16px; - line-height: 1.5; + display: block; + position: relative; + max-width: 580px; + list-style-type: none; + margin: 0 auto; } -.explorer-content ol li { - background-color: #6fa788; - padding: 1px; +.posts .post .post-container.new-group { + margin-top: 6px; +} + +/* Profile picture */ +.posts .post .profile-picture-container { + display: inline-block; + width: 60px; + vertical-align: top; +} + +.profile-picture { + background-image: linear-gradient(#389ed5, #59c8e2); + border-radius: 100%; + width: 50px; + height: 50px; + line-height: 53px; + float: left; + text-align: center; } -.posts .post-content { +.profile-picture .initials { + color: white; + font-size: 23px; + width: 100%; + height: 100%; +} + +/* Post content */ +.posts .post .post-content { + display: inline-block; + max-width: 80%; list-style-type: none; background-color: white; - width: 450px; - margin: 0 auto; - margin-top: 2px; - margin-bottom: 2px; - border-radius: 5px 15px 15px 5px; + border-radius: 5px 20px 20px 5px; padding: 12px 17px; + z-index: -1; + overflow: hidden; } -.post-content.new-group { - border-radius: 0 10px 10px 10px; - margin-top: 10px; +.posts .post .post-content.new-group { + border-radius: 0px 20px 20px 5px; } -.author { +.bubble-left { + position: relative; + margin-right: -5px; + float: right; + z-index: 0; +} + +.author, .author a, .author a:hover { + margin-bottom: 5px; color: #2984cd; font-weight: bold; + text-decoration: none; +} + +.posts .post .body { + display: inline; + padding-top: 5px; + padding-bottom: 5px; +} + +.posts .post .body a { + color: #2984cd; +} + +.posts .post .reply_to { + height: 20px; + padding: 5px; + margin-bottom: 2px; + background-color: #e4f1f9; + border-left: 4px solid #2e96d2; + border-radius: 5px; +} + +.media-container { + max-height: 200px; + margin-top: -12px; + margin-left: -17px; + margin-right: -17px; + margin-bottom: 10px; + overflow: hidden; } .media-container img { + margin-top: -155px; width: 100%; } +.post-content.new-group .media-container { + margin-top: 10px; +} + +.post-content.new-group .media-container img { + margin-top: -155px; + border-radius: 0px; +} + +/* Emoji reaction counts */ +.reactions { + margin-top: 3px; + margin-bottom: 3px; +} + +.reaction { + display: inline-block; + color: #168acd; + background-color: #e8f5fc; + font-weight: bold; + border-radius: 15px; + margin-top: 1px; + 
padding: 4px; + padding-left: 8px; + padding-right: 8px; + font-size: 16px; + vertical-align: middle; +} + +.reaction .reaction-count { + padding-left: 4px; + font-size: 14px; +} + +/* TOD on the bottom of the post */ +.metrics { + display: inline-block; + padding-left: 10px; + float: right; +} + +.metrics span { + font-size: 14px; + padding-left: 3px; + color: #a0acb6; +} + +/* External url button */ + +.external-url i { + color: #168acd; +} + +/* Day indicator between posts */ .day { - margin: 8px; + margin: 15px; text-align: center; color: white; } @@ -70,4 +191,41 @@ padding-right: 10px; background-color: rgba(0,0,0,.3); border-radius: 20px; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-image: linear-gradient(#389ed5, #59c8e2); + color: white; + border-radius: 5px 20px 20px 5px; + margin-left: 63px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; } \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html index 417a5ddbc..e7844392c 100644 --- a/webtool/templates/explorer/datasource-templates/telegram.html +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -12,60 +12,77 @@
    {% endif %} -
    -
    - {% if new_author or new_day %} - {% set author = post.author_username if not post.author_name else post.author_name %} - {% if post.author_name %} +
    +
    + {% if new_author or new_day %}
    - {% if not pseudonymised %} - - {% for name in author_name.split() %} - {{ name[0] }} - {% endfor %} + + {% set author = post.author_username if not post.author_name else post.author_name %} + {% if not pseudonymised and author %} + + {% for name in author.split()[:2] %}{{ name[0] }}{% endfor %} {% else %} - + {% endif %} +
    -
    + - + + +
    +
    +
    + {% if not pseudonymised %} + {{ author }} + {% else %} + + {% endif %} +
    + {% else %} +
    +
    + {% endif %} + {% if post.attachment_type %} +
    + + +
    {% endif %} -
    -
    - {% if not pseudonymised %} - {{ author }} - {% else %} - +
    + {% if post.reply_to %} + {% endif %} -
    - {% else %} -
    + {{ post.body | markdown | social_mediafy(datasource="telegram") | safe }} +
    + + {% if post.reactions %} +
    + {% set reactions = post.reactions|string_counter(emoji=True) %} + {% for reaction, count in reactions.items() %} + {{ reaction }}{{ count }} + {% endfor %} +
    {% endif %} - {% if post.attachment_type %} -
    - - - -
    +
    + {% if post.views %} {{ post.views|numberify }}{% endif %} + {% if post.forwards %} {{ post.forwards|numberify }}{% endif %} + + {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC + + {% if post.unix_timestamp_edited %} + + | edited {{ post.unix_timestamp_edited | datetime(fmt="%H:%M", wrap=False) }} UTC + {% endif %} -
    - {{ post.body }} -
    - - -
    - {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC -
    - {% if not pseudonymised %} {% endif %} From 2204cf70799e2cab8d2f7f4ac4570dbca0cf2244 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 10 Jul 2024 17:43:26 +0200 Subject: [PATCH 084/204] Show URLs nicely in Telegram template --- webtool/lib/template_filters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index d5d12385d..89a1e7336 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -218,8 +218,9 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body # Add URL links - for url in urls_from_text(body): - body = re.sub(url, "%s" % (url, url), body) + if datasource != "telegram": # Telegram has mardown links + for url in urls_from_text(body): + body = re.sub(url, "%s" % (url, url), body) # Add hashtag links if "hasthag" in base_urls[datasource]: From a904e65f7cc348ee460e83c7b477aa9babd6fe98 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:16:32 +0200 Subject: [PATCH 085/204] Add markdown text to Telegram --- datasources/telegram/search_telegram.py | 6 +++--- .../templates/explorer/datasource-templates/telegram.html | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 4c3ffcfc7..4ab7f9ee4 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -39,8 +39,7 @@ class SearchTelegram(Search): extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated - has_explorer_preset = True # Whether this data source has preset CSS and field settings for the Explorer - + # cache details_cache = None failures_cache = None @@ -664,6 +663,7 @@ def map_item(message): "author_name": fullname, "author_is_bot": "yes" if user_is_bot else "no", "body": message["message"], + "body_markdown": message["message_markdown"], "is_reply": is_reply, "reply_to": reply_to, "views": message["views"] if message["views"] else "", @@ -760,7 +760,7 @@ def serialize_obj(input_obj): # Store the markdown-formatted text if type(input_obj).__name__ == "Message": - mapped_obj["message"] = input_obj.text + mapped_obj["message_markdown"] = input_obj.text return mapped_obj diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html index e7844392c..f3b8dd9e4 100644 --- a/webtool/templates/explorer/datasource-templates/telegram.html +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -61,7 +61,7 @@ {% if post.reply_to %} {% endif %} - {{ post.body | markdown | social_mediafy(datasource="telegram") | safe }} + {{ post.body_markdown | markdown | social_mediafy(datasource="telegram") | safe }}
    {% if post.reactions %} From bb96af7cc27c8f18e5673200e766ef7ef1bbe2f0 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:16:42 +0200 Subject: [PATCH 086/204] Typo in Truth social search --- datasources/truth/search_truth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/truth/search_truth.py b/datasources/truth/search_truth.py index 52057e0fa..c1b10ad8a 100644 --- a/datasources/truth/search_truth.py +++ b/datasources/truth/search_truth.py @@ -35,7 +35,7 @@ def map_item(post): """ Parse Truth Social post - :param node: Data as received from Truth Social + :param post: Data as received from Truth Social :return dict: Mapped item """ From bc4f566528f946453aaf4f9d2a13ce5bcd04fdc9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 12 Jul 2024 17:17:27 +0200 Subject: [PATCH 087/204] Update Tumblr search so it works with the Neue Posts Format. --- datasources/tumblr/search_tumblr.py | 278 +++++++++++++++------------- 1 file changed, 146 insertions(+), 132 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 89784b9e3..ae8876a83 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -6,13 +6,17 @@ import time import pytumblr +import requests from requests.exceptions import ConnectionError from datetime import datetime +from ural import urls_from_text from common.config_manager import config from backend.lib.search import Search -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, strip_tags from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ConfigException +from common.lib.item_mapping import MappedItem + __author__ = "Sal Hagen" __credits__ = ["Sal Hagen", "Tumblr API (api.tumblr.com)"] @@ -27,7 +31,7 @@ class SearchTumblr(Search): category = "Search" # category title = "Search Tumblr" # title displayed in UI description = "Retrieve Tumblr posts by hashtag or blog." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI + extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -88,8 +92,8 @@ def get_options(cls, parent_dataset=None, user=None): "at max. Insert tags or names of blogs, one on each line. You may insert up to ten tags or " "blogs.\n\nTumblr tags may include whitespace and commas. A `#` before the tag is optional.\n\n" "Tag search only get posts explicitly associated with the exact tag you insert here. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is unfortunately not " - "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf 4CAT reached its Tumblr API rate " + "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is not " + "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf this 4CAT reached its Tumblr API rate " "limit, try again 24 hours later." 
}, "search_scope": { @@ -181,6 +185,7 @@ def get_items(self, query): queries = parameters.get("query").split(", ") fetch_reblogs = parameters.get("fetch_reblogs", False) + # Store all info here results = [] @@ -211,12 +216,18 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return + # for each tag or blog, get post for query in queries: # Get posts per tag if scope == "tag": - new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date) + # Used for getting tagged posts, which uses requests instead. + api_key = self.parameters.get("consumer_key") + if not api_key: + api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] + + new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) # Get posts per blog elif scope == "blog": @@ -278,13 +289,22 @@ def get_items(self, query): self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post {key}") continue if reblog_post: - reblog_post = self.parse_tumblr_posts([reblog_post], reblog=True) results.append(reblog_post[0]) + # Rename some keys so it works with anonymisation + for i in range(len(results)): + for key in list(results[i].keys()): + if key.startswith("blog"): + results[i][key.replace("blog", "author")] = results[i].pop(key) + elif key == "post_url": + results[i]["author_post_url"] = results[i].pop("post_url") + elif key == "slug": + results[i]["author_post_slug"] = results[i].pop("slug") + self.job.finish() return results - def get_posts_by_tag(self, tag, max_date=None, min_date=None): + def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ Get Tumblr posts posts with a certain tag :param tag, str: the tag you want to look for @@ -324,8 +344,21 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): break try: - # Use the pytumblr library to make the API call - posts = self.client.tagged(tag, before=max_date, limit=20, filter="raw") + # PyTumblr does not allow to use the `npf` parameter yet + # for the `tagged` endpoint (opened a pull request), so + # we're using requests here. + params = { + "tag": tag, + "api_key": api_key, + "before": max_date, + "limit": 20, + "filter": "raw", + "npf": True + } + url = "https://api.tumblr.com/v2/tagged" + response = requests.get(url, params=params) + posts = response.json()["response"] + except ConnectionError: self.update_status("Encountered a connection error, waiting 10 seconds.") time.sleep(10) @@ -346,6 +379,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): retries = 0 if check_post["id"] not in self.seen_ids: unseen_posts.append(check_post) + posts = unseen_posts # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested. @@ -390,8 +424,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): # Append posts to main list else: - posts = self.parse_tumblr_posts(posts) - # Get all timestamps and sort them. 
post_dates = sorted([post["timestamp"] for post in posts]) @@ -515,13 +547,9 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw") + posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) posts = posts["posts"] - #if (max_date - posts[0]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) - except Exception as e: self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date)) @@ -543,8 +571,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): if "notes" in post: all_notes.append(post["notes"]) - posts = self.parse_tumblr_posts(posts) - # Get the lowest date max_date = sorted([post["timestamp"] for post in posts])[0] @@ -564,10 +590,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): all_posts += posts - #if (max_date - posts[len(posts) - 1]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) - if len(all_posts) >= self.max_posts: self.max_posts_reached = True break @@ -576,10 +598,10 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): return all_posts, all_notes - def get_post_notes(self, di_blogs_ids, only_text_reblogs=True): + def get_post_notes(self, blogs_ids, only_text_reblogs=True): """ Gets the post notes. - :param di_blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. + :param blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. :param only_text_reblogs, bool: Whether to only keep notes that are text reblogs. """ # List of dict to get reblogs. Items are: [{"blog_name": post_id}] @@ -588,14 +610,14 @@ def get_post_notes(self, di_blogs_ids, only_text_reblogs=True): max_date = None # Do some counting - len_blogs = len(di_blogs_ids) + len_blogs = len(blogs_ids) count = 0 # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 - for key, value in di_blogs_ids.items(): + for key, value in blogs_ids.items(): count += 1 @@ -653,7 +675,7 @@ def get_post_by_id(self, blog_name, post_id): raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") # Request the specific post. - post = self.client.posts(blog_name, id=post_id) + post = self.client.posts(blog_name, id=post_id, npf=True) # Tumblr API can sometimes return with this kind of error: # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} @@ -740,120 +762,112 @@ def validate_query(query, request, user): del query["daterange"] query["query"] = items - query["board"] = query.get("search_scope") + "s" # used in web interface # if we made it this far, the query can be executed return query - def parse_tumblr_posts(self, posts, reblog=False): + @staticmethod + def map_item(post): """ - Function to parse Tumblr posts into the same dict items. + Parse Tumblr posts. Tumblr posts can be many different types, so some data processing is necessary. - :param posts, list: List of Tumblr posts as returned form the Tumblr API. - :param reblog, bool: Whether the post concerns a reblog of posts from the original dataset. 
    - - returns list processed_posts, a list with dictionary items of post info. + :param post, dict: A Tumblr post as returned from the Tumblr API. + + :return dict: Mapped item """ - # Store processed posts here - processed_posts = [] - - media_tags = ["photo", "video", "audio"] - - # Loop through all the posts and write a row for each of them. - for post in posts: - post_type = post["type"] - - # The post's text is in different keys depending on the post type - if post_type in media_tags: - text = post["caption"] - elif post_type == "link": - text = post["description"] - elif post_type == "text" or post_type == "chat": - text = post["body"] - elif post_type == "answer": - text = post["question"] + "\n" + post["answer"] - else: - text = "" - - # Different options for video types (YouTube- or Tumblr-hosted) - if post_type == "video": - - video_source = post["video_type"] - # Use `get` since some videos are deleted - video_url = post.get("permalink_url") - - if video_source == "youtube": - # There's no URL if the YouTube video is deleted - if video_url: - video_id = post["video"]["youtube"]["video_id"] - else: - video_id = "deleted" - else: - video_id = "unknown" - - else: - video_source = None - video_id = None - video_url = None - - # All the fields to write - processed_post = { - # General columns - "type": post_type, - "timestamp": post["timestamp"], - "is_reblog": reblog, - - # Blog columns - "author": post["blog_name"], - "subject": post["blog"]["title"], - "blog_description": post["blog"]["description"], - "blog_url": post["blog"]["url"], - "blog_uuid": post["blog"]["uuid"], - "blog_last_updated": post["blog"]["updated"], - - # Post columns - "id": post["id"], - "post_url": post["post_url"], - "post_slug": post["slug"], - "thread_id": post["reblog_key"], - "body": text.replace("\x00", ""), - "tags": ", ".join(post["tags"]) if post.get("tags") else None, - "notes": post["note_count"], - "urls": post.get("link_url"), - "images": ",".join([photo["original_size"]["url"] for photo in post["photos"]]) if post.get("photos") else None, - - # Optional video columns - "video_source": video_source if post_type == "video" else None, - "video_url": video_url if post_type == "video" else None, - "video_id": video_id if post_type == "video" else None, - "video_thumb": post.get("thumbnail_url"), # Can be deleted - - # Optional audio columns - "audio_type": post.get("audio_type"), - "audio_url": post.get("audio_source_url"), - "audio_plays": post.get("plays"), - - # Optional link columns - "link_author": post.get("link_author"), - "link_publisher": post.get("publisher"), - "link_image": post.get("link_image"), - - # Optional answers columns - "asking_name": post.get("asking_name"), - "asking_url": post.get("asking_url"), - "question": post.get("question"), - "answer": post.get("answer"), - - # Optional chat columns - "chat": post.get("dialogue") - } - - # Store the processed post - processed_posts.append(processed_post) - - return processed_posts + media_types = ["photo", "video", "audio"] + + # We're getting info as Neue Post Format (NPF) blocks, + # so we need to loop through some 'blocks'.
    
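    +        # Editor's illustration (hedged, following Tumblr's public NPF documentation):
    +        # each entry in post["content"] is a typed dict along the lines of
    +        #   {"type": "text", "text": "hello", "formatting": [{"type": "bold", "start": 0, "end": 5}]}
    +        #   {"type": "image", "media": [{"url": "https://.../image.jpg"}]}
    +        # which is why the code below dispatches on block["type"] and, for text blocks,
    +        # re-applies the formatting ranges as Markdown.
    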
+ image_urls = [] + video_urls = [] + video_thumb_urls = [] + audio_urls = [] + audio_artists = [] + linked_urls = [] + question = "" + answers = "" + raw_text = "" + formatted_text = [] + + # Loop through "blocks" + for block in post.get("content", []): + block_type = block["type"] + + if block_type == "image": + image_urls.append(block["media"][0]["url"]) + elif block_type == "audio": + audio_urls.append(block["media"]["url"]) + audio_artists.append(block["artist"]) + elif block_type == "video": + video_urls.append(block["media"]["url"]) + if "filmstrip" in block: + video_thumb_urls.append(block["filmstrip"]["url"]) + elif block_type == "link": + linked_urls.append(block["url"]) + elif block_type == "poll": + question += block["question"] + answers = [a["answer_text"] for a in block["answers"]] + + # We're gonna add some formatting to the text + elif block_type == "text": + + text = block["text"] + + extra_chars = 0 + if block.get("formatting"): + for fmt in block["formatting"]: + + fmt_type = fmt["type"] + s = fmt["start"] + extra_chars # Start of formatted substring + e = fmt["end"] + extra_chars # End of formatted substring + + if fmt_type == "link": + text = text[:s] + "[" + text[s:e] + "](" + fmt["formatting"]["url"] + ")" + text[e:] + extra_chars += 4 + len(fmt["formatting"]["url"]) + elif fmt_type == "italic": + text = text[:s] + "*" + text[s:e] + "*" + text[e:] + extra_chars += 2 + elif fmt_type == "bold": + text = text[:s] + "**" + text[s:e] + "**" + text[e:] + extra_chars += 4 + + if block.get("subtype") == "unordered-list-item": + text = "- " + text + + raw_text += block["text"] + "\n" + formatted_text.append(text) + + return MappedItem({ + "id": post["id"], + "author": post["author_name"], + "thread_id": post["reblog_key"], + "timestamp": post["timestamp"], + "author_subject": post["author"]["title"], + "author_description": strip_tags(post["author"]["description"]), + "author_url": post["author"]["url"], + "author_uuid": post["author"]["uuid"], + "author_last_updated": post["author"]["updated"], + "author_post_url": post["author_post_url"], + "author_post_slug": post["author_post_slug"], + "body": raw_text, + "body_markdown": "\n".join(formatted_text), + "tags": ",".join(post["tags"]) if post.get("tags") else "", + "notes": post["note_count"], + "linked_urls": ",".join(linked_urls) if linked_urls else "", + "image_urls": ",".join(image_urls) if image_urls else "", + "video_urls": ",".join(video_urls) if video_urls else "", + "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", + "audio_urls": ",".join(audio_urls) if audio_urls else "", + "audio_artist": ",".join(audio_artists) if audio_artists else "", + "author_asking_name": post.get("asking_name", ""), + "author_asking_url": post.get("asking_url", ""), + "poll_question": question, + "poll_answers": ",".join(answers) + }) def after_process(self): """ From 10c885376ff1ad63cbc7011d4d777a9b8d2d5d56 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 15 Jul 2024 21:21:25 +0200 Subject: [PATCH 088/204] Fix notes fetching for Tumblr, add extra notes metrics to NDJSONs and `map_items` --- datasources/tumblr/DESCRIPTION.md | 4 +- datasources/tumblr/search_tumblr.py | 251 ++++++++++++++-------------- 2 files changed, 127 insertions(+), 128 deletions(-) diff --git a/datasources/tumblr/DESCRIPTION.md b/datasources/tumblr/DESCRIPTION.md index a2be57d25..8269204a1 100644 --- a/datasources/tumblr/DESCRIPTION.md +++ b/datasources/tumblr/DESCRIPTION.md @@ -7,10 +7,12 @@ Be aware that the data may contain personal 
information. It is thus recommended To comply with the Tumblr API requirements, Tumblr datasets are deleted after three days. ### Rate limits -4CAT uses an internal API key to get Tumblr posts. These are limited to the +If set, 4CAT uses an internal API key to get Tumblr posts. These are limited to the [following rate limits](https://www.tumblr.com/docs/en/api/v2#rate-limits). However, administrators may request a rate limit increase via Tumblr. +If no internal API key is set, you can insert your own. + ### Date bugs The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used tags, it may return zero posts, even though older posts *do* exist. Check the oldest post in diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index ae8876a83..bf64702c2 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -110,9 +110,10 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Tags/blogs", "tooltip": "Separate with commas or new lines." }, - "fetch_reblogs": { + "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Also fetch reblogs with text? (warning: slow)", + "help": "Get post notes (warning: slow)", + "tooltip": "Also retrieve post notes. Likes and replies are added to the original post. Text reblogs are added as new posts.", "default": False } } @@ -183,8 +184,7 @@ def get_items(self, query): parameters = self.dataset.get_parameters() scope = parameters.get("search_scope", "") queries = parameters.get("query").split(", ") - fetch_reblogs = parameters.get("fetch_reblogs", False) - + get_notes = parameters.get("get_notes", False) # Store all info here results = [] @@ -216,7 +216,6 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return - # for each tag or blog, get post for query in queries: @@ -231,8 +230,8 @@ def get_items(self, query): # Get posts per blog elif scope == "blog": - new_results, notes = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - all_notes.append(notes) + new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) + else: self.dataset.update_status("Invalid scope") break @@ -246,61 +245,35 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # If we also want the posts that reblogged the fetched posts: - if fetch_reblogs and not self.max_posts_reached and not self.api_limit_reached: - self.dataset.update_status("Getting notes from all posts") - - # Reblog information is already returned for blog-level searches - if scope == "blog": - text_reblogs = [] - - # Loop through and add the text reblogs that came with the results. - for post_notes in all_notes: - for post_note in post_notes: - for note in post_note: - if note["type"] == "reblog": - text_reblogs.append({note["blog_name"]: note["post_id"]}) - - # Retrieving notes for tag-based posts should be done one-by-one. - # Fetching them all at once is not supported by the Tumblr API. - elif scope == "tag": - # Prepare dicts to pass to `get_post_notes` - posts_to_fetch = {result["author"]: result["id"] for result in results} - - # First extract the notes of each post, and only keep text reblogs - text_reblogs = self.get_post_notes(posts_to_fetch) - - # Get the full data for text reblogs. 
- if text_reblogs: - connection_retries = 0 - for i, text_reblog in enumerate(text_reblogs): - self.dataset.update_status("Got %i/%i text reblogs" % (i, len(text_reblogs))) - if connection_retries >= 5: - self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") - break - for key, value in text_reblog.items(): - if connection_retries >= 5: - break - try: - reblog_post = self.get_post_by_id(key, value) - except ConnectionRefusedError: - connection_retries += 1 - self.failed_reblogs.append(key) - self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post {key}") - continue - if reblog_post: - results.append(reblog_post[0]) - - # Rename some keys so it works with anonymisation - for i in range(len(results)): - for key in list(results[i].keys()): - if key.startswith("blog"): - results[i][key.replace("blog", "author")] = results[i].pop(key) - elif key == "post_url": - results[i]["author_post_url"] = results[i].pop("post_url") - elif key == "slug": - results[i]["author_post_slug"] = results[i].pop("slug") + # Loop through the results once to add note data and fetch text reblogs, + len_results = len(results) # results will change in length when we add reblogs. + for i in range(len_results): + + post = results[i] + + # Get note information + if get_notes and not self.max_posts_reached and not self.api_limit_reached: + + # Reblog information is already returned for blog-level searches + # and is stored as `notes` in the posts themselves. + # Retrieving notes for tag-based posts must be done one-by-one; + # fetching them all at once is not supported by the Tumblr API. + if not "notes" in post: + self.dataset.update_status("Getting note data for post %i/%i" % (i, len_results)) + + # Prepare dicts to pass to `get_post_notes` + notes = self.get_post_notes(post["blog_name"], post["id"]) + + if notes: + results[i]["notes"] = notes + # Get the full data for text reblogs and add them as new posts + for note in notes: + if note["type"] == "reblog": + text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) + if text_reblog: + results.append(text_reblog) + self.job.finish() return results @@ -353,7 +326,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): "before": max_date, "limit": 20, "filter": "raw", - "npf": True + "npf": True, + "notes_info": True } url = "https://api.tumblr.com/v2/tagged" response = requests.get(url, params=params) @@ -528,9 +502,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): # Store all posts in here all_posts = [] - # Store notes here, if they exist and are requested - all_notes = [] - # Some retries to make sure the Tumblr API actually returns everything retries = 0 self.max_retries = 48 # 2 days @@ -565,11 +536,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): # Append posts to main list else: - # Keep the notes, if so indicated - if self.parameters.get("fetch_reblogs"): - for post in posts: - if "notes" in post: - all_notes.append(post["notes"]) # Get the lowest date max_date = sorted([post["timestamp"] for post in posts])[0] @@ -596,72 +562,61 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): self.dataset.update_status("Collected %s posts" % str(len(all_posts))) - return all_posts, all_notes + return all_posts - def get_post_notes(self, blogs_ids, only_text_reblogs=True): - """ - Gets the post notes. - :param blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. 
- :param only_text_reblogs, bool: Whether to only keep notes that are text reblogs. + def get_post_notes(self, blog_id, post_id): """ - # List of dict to get reblogs. Items are: [{"blog_name": post_id}] - text_reblogs = [] + Gets data on the notes of a specific post. + :param blog_id, str: The ID of the blog. + :param post_id, str: The ID of the post. + :returns: a list with dictionaries of notes. + """ + + post_notes = [] max_date = None # Do some counting - len_blogs = len(blogs_ids) count = 0 # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 - for key, value in blogs_ids.items(): - - count += 1 - - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") + count += 1 - # First, get the blog names and post_ids from reblogs - # Keep digging till there's nothing left, or if we can fetch no new notes - while True: - - # Requests a post's notes - notes = self.client.notes(key, id=value, before_timestamp=max_date) - - if only_text_reblogs: + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") - if "notes" in notes: - notes_retries = 0 + while True: - for note in notes["notes"]: - # If it's a reblog, extract the data and save the rest of the posts for later - if note["type"] == "reblog": - if note.get("added_text"): - text_reblogs.append({note["blog_name"]: note["post_id"]}) + # Requests a post's notes + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) + + if "notes" in notes: + notes_retries = 0 - if notes.get("_links"): - max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + for note in notes["notes"]: + post_notes.append(note) - # If there's no `_links` key, that's all. - else: - break + if notes.get("_links"): + max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - # If there's no "notes" key in the returned dict, something might be up - else: - self.dataset.update_status("Couldn't get notes for Tumblr request " + str(notes)) - notes_retries += 1 - pass + # If there's no `_links` key, that's all. + else: + break - if notes_retries > max_notes_retries: - self.failed_notes.append(key) - break + # If there's no "notes" key in the returned dict, something might be up + else: + self.dataset.update_status("Couldn't get notes for Tumblr post " + str(post_id)) + notes_retries += 1 + pass - self.dataset.update_status("Identified %i text reblogs in %i/%i notes" % (len(text_reblogs), count, len_blogs)) + if notes_retries > max_notes_retries: + self.failed_notes.append(post_id) + break - return text_reblogs + return post_notes def get_post_by_id(self, blog_name, post_id): """ @@ -674,12 +629,28 @@ def get_post_by_id(self, blog_name, post_id): if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True) + connection_retries = 0 + + while True: + if connection_retries >= 5: + self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") + break + try: + # Request the specific post. 
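+                # (`npf=True` requests the post in Neue Post Format, i.e. as a
+                # list of typed content blocks rather than legacy per-type fields.)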
+ post = self.client.posts(blog_name, id=post_id, npf=True) + + except ConnectionRefusedError: + connection_retries += 1 + self.failed_reblogs.append(note["id"]) + self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post " + note["id"]) + continue + + if post: + break # Tumblr API can sometimes return with this kind of error: # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if "posts" not in post: + if not post or "posts" not in post: return None # Get the first element of the list - it's always one post. @@ -780,7 +751,7 @@ def map_item(post): media_types = ["photo", "video", "audio"] - # We're getting info as Neue Text Format types, + # We're getting info as Neue Post Format types, # so we need to loop through some 'blocks'. image_urls = [] video_urls = [] @@ -792,6 +763,10 @@ def map_item(post): answers = "" raw_text = "" formatted_text = [] + authors_liked = [] + authors_reblogged = [] + authors_replied = [] + replies = [] # Loop through "blocks" for block in post.get("content", []): @@ -841,22 +816,44 @@ def map_item(post): raw_text += block["text"] + "\n" formatted_text.append(text) + # Add note data + for note in post.get("notes", []): + if note["type"] == "like": + authors_liked.append(note["blog_name"]) + elif note["type"] in ("posted", "reblog"): + # If the original post is a text reblog, it will also show up in the notes. + # We can skip these since the data is already in the main post dict. + if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: + authors_reblogged.append(note["blog_name"]) + elif note["type"] == "reply": + authors_replied.append(note["blog_name"]) + replies.append(note["reply_text"]) + return MappedItem({ + "type": post["original_type"] if "original_type" in post else post["type"], "id": post["id"], - "author": post["author_name"], + "author": post["blog_name"], "thread_id": post["reblog_key"], "timestamp": post["timestamp"], - "author_subject": post["author"]["title"], - "author_description": strip_tags(post["author"]["description"]), - "author_url": post["author"]["url"], - "author_uuid": post["author"]["uuid"], - "author_last_updated": post["author"]["updated"], - "author_post_url": post["author_post_url"], - "author_post_slug": post["author_post_slug"], + "author_subject": post["blog"]["title"], + "author_description": strip_tags(post["blog"]["description"]), + "author_url": post["blog"]["url"], + "author_uuid": post["blog"]["uuid"], + "author_last_updated": post["blog"]["updated"], + "author_post_url": post["post_url"], + "author_post_slug": post["slug"], + "is_reblog": True if post.get("original_type") == "note" else "", "body": raw_text, "body_markdown": "\n".join(formatted_text), "tags": ",".join(post["tags"]) if post.get("tags") else "", "notes": post["note_count"], + "like_count": len(authors_liked), + "authors_liked": ",".join(authors_liked), + "reblog_count": len(authors_reblogged), + "authors_reblogged": ",".join(authors_reblogged), + "reply_count": len(authors_replied), + "authors_replied": ",".join(authors_replied), + "replies": "\n\n".join(replies), "linked_urls": ",".join(linked_urls) if linked_urls else "", "image_urls": ",".join(image_urls) if image_urls else "", "video_urls": ",".join(video_urls) if video_urls else "", From ce16f972237c65185b6e2a51d87980c588aff2ff Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 16 Jul 2024 21:48:30 +0200 Subject: [PATCH 089/204] Make Tumblr search work with new blocks formatting, 
include some new content --- datasources/tumblr/search_tumblr.py | 43 ++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index bf64702c2..9b2c89314 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -759,6 +759,7 @@ def map_item(post): audio_urls = [] audio_artists = [] linked_urls = [] + linked_titles = [] question = "" answers = "" raw_text = "" @@ -768,8 +769,18 @@ def map_item(post): authors_replied = [] replies = [] + # Keep track if blocks belong to another post, + # which is stored in `layout`. + body_reblogged = [] + reblogged_text_blocks = [] + author_reblogged = "" + for layout_block in post.get("layout", []): + if layout_block["type"] == "ask": + reblogged_text_blocks += layout_block["blocks"] + author_reblogged = layout_block["attribution"]["blog"]["name"] + # Loop through "blocks" - for block in post.get("content", []): + for i, block in enumerate(post.get("content", [])): block_type = block["type"] if block_type == "image": @@ -783,12 +794,18 @@ def map_item(post): video_thumb_urls.append(block["filmstrip"]["url"]) elif block_type == "link": linked_urls.append(block["url"]) + if "title" in block: + linked_titles.append(block["title"]) + if "description" in block: + raw_text += block["description"] + "\n" + formatted_text.append(block["description"]) elif block_type == "poll": question += block["question"] answers = [a["answer_text"] for a in block["answers"]] # We're gonna add some formatting to the text - elif block_type == "text": + # Skip text that is part of a reblogged post. + elif block_type == "text" and i not in reblogged_text_blocks: text = block["text"] @@ -816,23 +833,28 @@ def map_item(post): raw_text += block["text"] + "\n" formatted_text.append(text) + elif block_type == "text" and i in reblogged_text_blocks: + body_reblogged.append(block["text"]) + # Add note data for note in post.get("notes", []): if note["type"] == "like": - authors_liked.append(note["blog_name"]) + # Inserting at the start of the list to maintain chronological order. + authors_liked.insert(0, note["blog_name"]) elif note["type"] in ("posted", "reblog"): # If the original post is a text reblog, it will also show up in the notes. # We can skip these since the data is already in the main post dict. 
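                # A note whose blog name and timestamp match the post's own
                # identifies the post itself rather than a genuine reblog.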
if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: - authors_reblogged.append(note["blog_name"]) + authors_reblogged.insert(0, note["blog_name"]) elif note["type"] == "reply": - authors_replied.append(note["blog_name"]) - replies.append(note["reply_text"]) + authors_replied.insert(0, note["blog_name"]) + replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], "id": post["id"], "author": post["blog_name"], + "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], "timestamp": post["timestamp"], "author_subject": post["blog"]["title"], @@ -840,11 +862,13 @@ def map_item(post): "author_url": post["blog"]["url"], "author_uuid": post["blog"]["uuid"], "author_last_updated": post["blog"]["updated"], - "author_post_url": post["post_url"], - "author_post_slug": post["slug"], + "post_url": post["post_url"], + "post_slug": post["slug"], "is_reblog": True if post.get("original_type") == "note" else "", "body": raw_text, "body_markdown": "\n".join(formatted_text), + "body_reblogged": "\n".join(body_reblogged) if body_reblogged else "", + "author_reblogged": author_reblogged, "tags": ",".join(post["tags"]) if post.get("tags") else "", "notes": post["note_count"], "like_count": len(authors_liked), @@ -855,13 +879,12 @@ def map_item(post): "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "linked_urls": ",".join(linked_urls) if linked_urls else "", + "linked_urls_titles": "\n".join(linked_titles) if linked_titles else "", "image_urls": ",".join(image_urls) if image_urls else "", "video_urls": ",".join(video_urls) if video_urls else "", "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", "audio_urls": ",".join(audio_urls) if audio_urls else "", "audio_artist": ",".join(audio_artists) if audio_artists else "", - "author_asking_name": post.get("asking_name", ""), - "author_asking_url": post.get("asking_url", ""), "poll_question": question, "poll_answers": ",".join(answers) }) From e392544eed6464613a9e429bb5e22df56bb6e887 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 16 Jul 2024 21:48:52 +0200 Subject: [PATCH 090/204] Tumblr Explorer Template v0.5 --- webtool/static/css/explorer/tumblr.css | 250 +++++++++++++++--- .../explorer/datasource-templates/tumblr.html | 131 +++++++++ 2 files changed, 346 insertions(+), 35 deletions(-) create mode 100644 webtool/templates/explorer/datasource-templates/tumblr.html diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index e3ef2eaa8..1937bb3c9 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -1,68 +1,248 @@ +/* General stuff */ .explorer-content { background-color: #001935; + padding: 20px } -#metadata, footer { - color: white; +.posts li.post { + position: relative; + list-style-type: none; + font-family: Helvetica, sans-serif; + background-color: white; + color: black; + font-size: 16px; + margin: 0 auto; + margin-top: 20px; + padding: 0px; + border-radius: 8px; + max-width: 540px; } -.content { - font-family: "Favorit", "Helvetica Neue", "HelveticaNeue", Helvetica, Arial, sans-serif; +.author { + font-size: 13px; + font-weight: bold; } -.posts li.post { - background-color: white; - color: black; - font-size: 14px; - left: 0; +.author-avatar { + width: 32px; + margin-right: 10px; +} + +.author-avatar img { border-radius: 
3px; - max-width: 540px; - padding: 0; + width: 100%; } -.posts li.post header { - display: inline-block; +/* Main author info */ +header { + display: flex; + align-items: center; + padding: 19px; text-decoration: none; - font-weight: bold; - border: none; - padding: 0px; - line-height: 1.7em; - margin: 25px; + color: black; + overflow: hidden; +} + +header a { + color: black; +} + +header .author-avatar { + display: inline-block; +} + +header .author { + display: inline-block; +} + +/* Media */ +.media-container { + width: 100%; margin-bottom: 10px; } -.posts li.post article { - padding: 0; - margin: 0; +.media-container img { + width: 100%; } -.posts li.post .post-content { +/* Post text content */ +.post-content { display: block; - margin: 25px; - margin-top: 0px; + padding: 0px 19px 0px 19px; } -.posts li.post .post-tags { +.post-content .body, .body-reblogged { + white-space: pre-wrap; + line-height: 1.5em; +} + +.reblogged-content { + margin-bottom: 19px; + display: inline-block; + max-width: 400px; + padding: 25px; + background-color: #ededed; +} + +.author-reblogged { + padding-bottom: 3px; +} + +.author-reblogged-avatar { + display: inline-block; +} + +.embedded-link { + padding: 30px; + background-color: #001935; + color: white; + text-align: center; + font-size: 18px; + border-radius: 15px; + margin-bottom: 19px; +} + +.embedded-link a { + color: white; +} + +.poll-question { + font-size: 18px; +} + +.poll-answer { + color: white; + background-color: #001935; + margin: 8px; + padding: 8px; + border-radius: 15px; + text-align: center; +} + +.posts .external-url { + position: absolute; + bottom: 0; + right: 0; + padding: 10px; +} + +.tags { + padding-top: 19px; + list-style-type: none; color: #5e5e5e; - margin-top: 20px; word-break: break-all; } -.posts li.post .author { +.tags a { + color: #5e5e5e; +} + +.tags li { + padding: 5px 5px 5px 0px; + display: inline-block; + background-color: white; +} + +/* Post footer */ +footer { + margin: 19px; + padding-top: 19px; + border-top: 1px solid rgba(0,0,0,0.13); +} + +.time { + color: #5e5e5e; +} + +/* Note metrics */ +.notes { +} + +.note-counts { + padding-top: 19px; +} + +.note-count { + display: inline-block; + color: #5a5a5a; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; +} + +.note-count.total { font-weight: bold; } -.posts li.post .post-image { - width: 100%; - margin-bottom: 15px; +/* Replies */ +.replies { + margin-top: 12px; + display: table; +} + +.reply { + background-color: white; + display: table-row; +} + +.reply .author-info { + display: table-cell; +} + +.reply .author-replied-avatar { + vertical-align: middle; + display: table-cell; + padding-right: 10px; +} + +.reply-content { + vertical-align: top; + margin-top: 5px; + margin-bottom: 5px; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; } -.posts li.post .external-url { - +.reply-content .author { + margin-bottom: 5px; } -.posts li.post .post-annotations { +/* Annotation fields */ +.post-annotations { background-color: white; - border-top: 1px solid #5e5e5e; - margin-right: 0; + color: black; + border-radius: 8px; + border-top: 1px solid #ebebeb; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation input { + border-radius: 5px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: 
inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; } \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html new file mode 100644 index 000000000..fec6a42ed --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -0,0 +1,131 @@ +
    + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
    + + + +
    + {% endif %} + + {{ post.get("author") }} + + {% else %} + + + {% endif %} + +
    + + +{% if post["image_urls"] or post["video_urls"] %} +
    + {% if post["image_urls"] %} + {% for image_url in post["image_urls"].split(",") %} + + {% endfor %} + {% elif post["video_thumb_urls"] %} + {% for video_thumb_url in post["video_thumb_urls"].split(",") %} + {% if not pseudonymised %}{% endif %} +
    +
    + {% if not pseudonymised %}
    {% endif %} + {% endfor %} + {% endif %} +
    +{% endif %} + +
    + + {% if post.get("body_reblogged") %} +
    +
    +
    {% if not pseudonymised %}{{ post["author_reblogged"] }}{% else %}{% endif %}
    +
    {{ post["body_reblogged"] }}
    +
    +
    +
    + {% if not pseudonymised %} + + {% endif %} +
    + {% endif %} + + {% if post.get("linked_urls") %} + {% for url in post["linked_urls"].split(",") %} + + {% endfor %} + {% endif %} + + {% if post.get("poll_question") %} +
    +
    {{ post["poll_question"] }}
    +
      + {% for poll_answer in post["poll_answers"].split(",") %} +
{{ poll_answer }}
    • + {% endfor %} +
    + +
    + {% endif %} + +
    {{ post.get("body_markdown") | social_mediafy(datasource='tumblr') | safe }}
    + + {% if post.get("tags") %} +
    +
      + {% for tag in post["tags"].split(",") %} +
#{{ tag }}
    • + {% endfor %} +
    +
    + {% endif %} +
    + +
    + + +
    Posted {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
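{# `datetime` is a 4CAT template filter, presumably defined in webtool/lib/template_filters.py; `fmt` takes a strftime pattern. #}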
    + + + {% if post.notes %} +
    +
    + {{ post.get("notes") | commafy }} note{% if post.get("notes", 0) > 1 %}s{% endif %} + + {% if post.get("reblog_count") %} + {{ post.reblog_count }} + {% endif %} + + {% if post.get("like_count") %} + + {% endif %} + + {% if post.get("reply_count") %} + {{ post.get("reply_count") }} + {% endif %} +
    + {% if post.get("authors_replied") %} +
    + {% for author_replied in post.get("authors_replied").split(",") %} +
    + {% if not pseudonymised %} + + {% endif %} +
    +
    +
    {% if not pseudonymised %}{{ author_replied }}{% else %}{% endif %}
    +
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") }}
    +
    +
  • + {% endfor %} +
    + {% endif %} +
    + {% endif %} + +
    \ No newline at end of file From 40f5fa09a015940b61b628a42799f05d69e4c33a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:48:30 +0200 Subject: [PATCH 091/204] Bump PyTumblr version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b75724a23..ad8eca1d0 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "psycopg2~=2.9.0", "pyahocorasick~=1.4.0", "PyMySQL~=1.0", - "PyTumblr==0.1.0", + "PyTumblr==0.1.2", "requests~=2.27", "requests_futures", "scenedetect==0.6.0.3", From 9e486ad7fa3476cc934db5436929d4766b181f19 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:48:57 +0200 Subject: [PATCH 092/204] Dashes are okay for Tumblr Blog names --- webtool/lib/template_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 89a1e7336..3bc75ebb9 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -229,13 +229,13 @@ def _jinja2_filter_social_mediafy(body, datasource=""): # replace tags that are a substring of another, longer tag. tags = sorted(tags, key=lambda x: len(x), reverse=True) for tag in tags: - # Match the string, but not if it's preceded by a >, which indicates that we've already added an tag. + # Match the string, but not if it's preceded by a >, which indicates that we've already added an anchor tag. # This avoids problems with repeated substrings (e.g. #Dog and #DogOwners). body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body) # Add @-mention links if "mention" in base_urls[datasource]: - mentions = re.findall(r"@[\w0-9]+", body) + mentions = re.findall(r"@[\w0-9-]+", body) mentions = sorted(mentions, key=lambda x: len(x), reverse=True) for mention in mentions: body = re.sub(r"(?)(" + mention + ")", "%s" % (base_urls[datasource]["mention"] + mention[1:], mention), body) From 9837a8a48102939c56e0f9743fc2a63566308b5a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:49:10 +0200 Subject: [PATCH 093/204] Better styling for Tumblr Explorer Template --- webtool/static/css/explorer/tumblr.css | 94 +++++++++++++++++--------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index 1937bb3c9..cc33bc29d 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -57,14 +57,34 @@ header .author { /* Media */ .media-container { - width: 100%; - margin-bottom: 10px; + position: relative; + margin: 3px -19px 3px -19px; + overflow-x: hidden; } .media-container img { width: 100%; } +.media-container.video img { + min-height: 300px; + width: auto; + filter: blur(1.5rem); +} + +.play-button { + position: absolute; + width: 100%; + top: 38%; + left: 45%; + font-size: 80px; +} + +.play-button i { + color: white; + opacity: .7; +} + /* Post text content */ .post-content { display: block; @@ -76,6 +96,10 @@ header .author { line-height: 1.5em; } +.post-content .body { + padding: 3px 0px 3px 0px; +} + .reblogged-content { margin-bottom: 19px; display: inline-block; @@ -107,27 +131,21 @@ header .author { } .poll-question { - font-size: 18px; + font-size: 20px; + padding: 3px 0px 3px 0px; } .poll-answer { color: white; background-color: #001935; margin: 8px; - padding: 8px; - border-radius: 15px; - text-align: center; -} - -.posts .external-url { - position: absolute; - bottom: 0; - 
right: 0; - padding: 10px; + padding: 8px; + border-radius: 15px; + text-align: center; } .tags { - padding-top: 19px; + padding-top: 5px; list-style-type: none; color: #5e5e5e; word-break: break-all; @@ -154,6 +172,14 @@ footer { color: #5e5e5e; } +.posts .external-url { + color: #00b4fa; + position: absolute; + top: 0; + right: 0; + padding: 15px; +} + /* Note metrics */ .notes { } @@ -191,8 +217,8 @@ footer { .reply .author-replied-avatar { vertical-align: middle; - display: table-cell; - padding-right: 10px; + display: table-cell; + padding-right: 10px; } .reply-content { @@ -202,47 +228,49 @@ footer { border-radius: 18px; border: 1px solid #ebebeb; padding: 9px 18px; + font-size: 14px; + color: #5e5e5e; } .reply-content .author { + color: black; margin-bottom: 5px; } /* Annotation fields */ .post-annotations { - background-color: white; - color: black; - border-radius: 8px; - border-top: 1px solid #ebebeb; + background-color: #7c5cff; + color: white; + border-radius: 0px 0px 8px 8px; } .post-annotation { - padding: 15px; + padding: 15px; } .post-annotation input { - border-radius: 5px; + border-radius: 5px; } .post-annotation > .annotation-label { - display: inline-block; - vertical-align: middle; - text-align: right; - min-width: 150px; - margin-right: 5px; - line-height: 1.6em; - overflow-x: hidden; + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; } .post-annotation.checkbox > .post-annotation-options { - display: inline-block; + display: inline-block; } .post-annotation-options { - display: inline-block; - vertical-align: top; + display: inline-block; + vertical-align: top; } .post-annotation-options > input { - display: inline-block; + display: inline-block; } \ No newline at end of file From 4d19e5230f6342dfeed0e9961f6998284c565eaa Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:49:35 +0200 Subject: [PATCH 094/204] Include post blocks in the right order in Tumblr Explorer Template --- .../explorer/datasource-templates/tumblr.html | 91 +++++++++++-------- 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index fec6a42ed..09f9d06a2 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -20,23 +20,6 @@ - -{% if post["image_urls"] or post["video_urls"] %} -
    - {% if post["image_urls"] %} - {% for image_url in post["image_urls"].split(",") %} - - {% endfor %} - {% elif post["video_thumb_urls"] %} - {% for video_thumb_url in post["video_thumb_urls"].split(",") %} - {% if not pseudonymised %}{% endif %} -
    -
    - {% if not pseudonymised %}
    {% endif %} - {% endfor %} - {% endif %} -
    -{% endif %}
    @@ -53,26 +36,58 @@ {% endif %}
    {% endif %} - - {% if post.get("linked_urls") %} - {% for url in post["linked_urls"].split(",") %} - - {% endfor %} - {% endif %} - - {% if post.get("poll_question") %} -
    -
    {{ post["poll_question"] }}
    -
      - {% for poll_answer in post["poll_answers"].split(",") %} -
    • {{ poll_answer }}
    • - {% endfor %} -
    -
    - {% endif %} - -
    {{ post.get("body_markdown") | social_mediafy(datasource='tumblr') | safe }}
    + + + {% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %} + {% for block in post.content_order.split(",") %} + + {% if block == "text" %} +

    {{ post.get("body_markdown").split("\n")[block_counts.text] | social_mediafy(datasource='tumblr') | safe }}

    + {% set block_counts.text = block_counts.text + 1 %} + + {% elif block == "image" %} +
    + +
    + {% set block_counts.image = block_counts.image + 1 %} + + {% elif block == "video" %} + + {% set block_counts.video = block_counts.video + 1 %} + + {% elif block == "audio" %} +
    + +
    +
    +
    + {% set block_counts.audio = block_counts.audio + 1 %} + + {% elif block == "link" %} + {% set url = post.linked_urls.split(",")[block_counts.link] %} + {% set url_title = post.linked_urls_titles.split(",")[block_counts.link] %} + + {% set block_counts.link = block_counts.link + 1 %} + + {% elif block == "poll" %} + +
    +
    {{ post["poll_question"] }}
    +
      + {% for poll_answer in post["poll_answers"].split(",") %} +
{{ poll_answer }}
    • + {% endfor %} +
    +
    + {% endif %} + {% endfor %} + {% if post.get("tags") %}
    @@ -88,7 +103,7 @@
    -
    Posted {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    +
    {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    {% if post.notes %} @@ -119,7 +134,7 @@
    {% if not pseudonymised %}{{ author_replied }}{% else %}{% endif %}
    -
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") }}
    +
    {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") | social_mediafy(datasource='tumblr') | safe }}
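{# `replies` holds "author: text" strings joined by blank lines; the author prefix is stripped here because the author is shown separately. #}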
    {% endfor %} From d5d14e03249ebcf0c7d33f8fef118ab489c3be9a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 17 Jul 2024 17:50:10 +0200 Subject: [PATCH 095/204] Get block orders and start changing how note retrieval works in Tumblr search --- datasources/tumblr/search_tumblr.py | 227 ++++++++++++++++------------ 1 file changed, 129 insertions(+), 98 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 9b2c89314..ee23e2f30 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -49,7 +49,7 @@ class SearchTumblr(Search): seen_ids = set() client = None failed_notes = [] - failed_reblogs = [] + failed_posts = [] config = { # Tumblr API keys to use for data capturing @@ -219,31 +219,31 @@ def get_items(self, query): # for each tag or blog, get post for query in queries: - # Get posts per tag - if scope == "tag": - # Used for getting tagged posts, which uses requests instead. - api_key = self.parameters.get("consumer_key") - if not api_key: - api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] + # Get posts per tag + if scope == "tag": + # Used for getting tagged posts, which uses requests instead. + api_key = self.parameters.get("consumer_key") + if not api_key: + api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] - new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) + new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) - # Get posts per blog - elif scope == "blog": - new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) + # Get posts per blog + elif scope == "blog": + new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - else: - self.dataset.update_status("Invalid scope") - break + else: + self.dataset.update_status("Invalid scope") + break - results += new_results + results += new_results - if self.max_posts_reached: - self.dataset.update_status("Max posts exceeded") - break - if self.api_limit_reached: - self.dataset.update_status("API limit reached") - break + if self.max_posts_reached: + self.dataset.update_status("Max posts exceeded") + break + if self.api_limit_reached: + self.dataset.update_status("API limit reached") + break # Loop through the results once to add note data and fetch text reblogs, len_results = len(results) # results will change in length when we add reblogs. @@ -259,10 +259,10 @@ def get_items(self, query): # Retrieving notes for tag-based posts must be done one-by-one; # fetching them all at once is not supported by the Tumblr API. 
if not "notes" in post: - self.dataset.update_status("Getting note data for post %i/%i" % (i, len_results)) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) - # Prepare dicts to pass to `get_post_notes` - notes = self.get_post_notes(post["blog_name"], post["id"]) + notes = self.get_notes(post["blog_name"], post["id"]) + time.sleep(.2) if notes: results[i]["notes"] = notes @@ -273,6 +273,7 @@ def get_items(self, query): text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) if text_reblog: results.append(text_reblog) + time.sleep(.2) self.job.finish() return results @@ -334,7 +335,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): posts = response.json()["response"] except ConnectionError: - self.update_status("Encountered a connection error, waiting 10 seconds.") + self.update_status("Encountered a connection error, waiting 10 seconds") time.sleep(10) retries += 1 continue @@ -382,11 +383,9 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): if date_retries < 96: max_date -= 21600 # Decrease by six hours - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)" % (str(len(all_posts)), tag, max_date_str, str(date_retries),)) elif date_retries <= self.max_date_retries: max_date -= 604800 # Decrease by one week - retry_str = str(date_retries - 96) - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)" % (str(len(all_posts)), tag, max_date_str, str(retry_str),)) + self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. if min_date: @@ -481,7 +480,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): self.max_posts_reached = True break - self.dataset.update_status("Collected %s posts for tag %s, now looking for posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + self.dataset.update_status("Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + time.sleep(.2) return all_posts @@ -522,7 +522,6 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): posts = posts["posts"] except Exception as e: - self.dataset.update_status("Reached the limit of the Tumblr API. 
Last timestamp: %s" % str(max_date)) self.api_limit_reached = True break @@ -560,11 +559,54 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): self.max_posts_reached = True break - self.dataset.update_status("Collected %s posts" % str(len(all_posts))) + self.dataset.update_status("Collected %s posts for blog %s" % str(len(all_posts), blog)) + time.sleep(.2) return all_posts - def get_post_notes(self, blog_id, post_id): + def get_post_by_id(self, blog_name, post_id): + """ + Fetch individual posts + :param blog_name, str: The blog's name + :param id, int: The post ID + + returns result list, a list with a dictionary with the post's information + """ + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") + + connection_retries = 0 + + while True: + if connection_retries >= 5: + self.dataset.update_status("Too many connection errors; unable to collect post %s" % post_id) + break + try: + # Request the specific post. + post = self.client.posts(blog_name, id=post_id, npf=True, reblog_info=True, notes_info=True, filter="raw") + + except ConnectionRefusedError: + connection_retries += 1 + self.failed_posts.append(note["id"]) + self.dataset.update_status("ConnectionRefused: Unable to collect reblogs for post %s" % post_id) + time.sleep(10) + continue + + if post: + break + time.sleep(.2) + + # Tumblr API can sometimes return with this kind of error: + # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} + if not post or "posts" not in post: + return None + + # Get the first element of the list - it's always one post. + result = post["posts"][0] + + return result + + def get_notes(self, blog_id, post_id): """ Gets data on the notes of a specific post. :param blog_id, str: The ID of the blog. @@ -579,6 +621,9 @@ def get_post_notes(self, blog_id, post_id): # Do some counting count = 0 + # Some posts have tens of thousands of notes + # so we'll cap this at 100 + # Stop trying to fetch the notes after this many retries max_notes_retries = 10 notes_retries = 0 @@ -590,9 +635,27 @@ def get_post_notes(self, blog_id, post_id): while True: + if notes_retries >= max_notes_retries: + self.dataset.update_status("Too many connection errors; unable to collect notes for post %s" % post_id) + self.failed_posts.append(post_id) + break + # Requests a post's notes - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) - + try: + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) + print(notes) + except ConnectionRefusedError: + self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) + notes_retries += 1 + time.sleep(10) + continue + + except Exception as e: + # Stop with unknown errors + self.dataset.update_status("Couldn't get notes for post %s. Unknown error: %s" % (post_id, e)) + notes_retries += 1 + break + if "notes" in notes: notes_retries = 0 @@ -600,7 +663,9 @@ def get_post_notes(self, blog_id, post_id): post_notes.append(note) if notes.get("_links"): + print("more notes for " + str(blog_id) + " " + str(post_id)) max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + time.sleep(.2) # If there's no `_links` key, that's all. 
else: @@ -608,55 +673,11 @@ def get_post_notes(self, blog_id, post_id): # If there's no "notes" key in the returned dict, something might be up else: - self.dataset.update_status("Couldn't get notes for Tumblr post " + str(post_id)) notes_retries += 1 - pass - - if notes_retries > max_notes_retries: - self.failed_notes.append(post_id) - break - - return post_notes - - def get_post_by_id(self, blog_name, post_id): - """ - Fetch individual posts - :param blog_name, str: The blog's name - :param id, int: The post ID - - returns result list, a list with a dictionary with the post's information - """ - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - - connection_retries = 0 - - while True: - if connection_retries >= 5: - self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") - break - try: - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True) - - except ConnectionRefusedError: - connection_retries += 1 - self.failed_reblogs.append(note["id"]) - self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post " + note["id"]) + time.sleep(1) continue - - if post: - break - # Tumblr API can sometimes return with this kind of error: - # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if not post or "posts" not in post: - return None - - # Get the first element of the list - it's always one post. - result = post["posts"][0] - - return result + return post_notes @staticmethod def get_tumblr_keys(user): @@ -762,8 +783,9 @@ def map_item(post): linked_titles = [] question = "" answers = "" - raw_text = "" + raw_text = [] formatted_text = [] + content_order = [] # To retain the order in which post blocks appear authors_liked = [] authors_reblogged = [] authors_replied = [] @@ -792,6 +814,8 @@ def map_item(post): video_urls.append(block["media"]["url"]) if "filmstrip" in block: video_thumb_urls.append(block["filmstrip"]["url"]) + elif "poster" in block: + video_thumb_urls.append(block["poster"][0]["url"]) elif block_type == "link": linked_urls.append(block["url"]) if "title" in block: @@ -800,8 +824,9 @@ def map_item(post): raw_text += block["description"] + "\n" formatted_text.append(block["description"]) elif block_type == "poll": - question += block["question"] - answers = [a["answer_text"] for a in block["answers"]] + # Only one poll can be added per post + question = block["question"] + answers = ",".join([a["answer_text"] for a in block["answers"]]) # We're gonna add some formatting to the text # Skip text that is part of a reblogged post. @@ -830,11 +855,16 @@ def map_item(post): if block.get("subtype") == "unordered-list-item": text = "- " + text - raw_text += block["text"] + "\n" + raw_text.append(block["text"]) formatted_text.append(text) elif block_type == "text" and i in reblogged_text_blocks: body_reblogged.append(block["text"]) + # Reblogged text is not considered as an ordered post block, + # as it is always put first. 
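+                # The `continue` below also keeps reblogged text out of
+                # `content_order`, which only tracks the post's own blocks.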
+ continue + + content_order.append(block_type) # Add note data for note in post.get("notes", []): @@ -865,11 +895,12 @@ def map_item(post): "post_url": post["post_url"], "post_slug": post["slug"], "is_reblog": True if post.get("original_type") == "note" else "", - "body": raw_text, + "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_reblogged": "\n".join(body_reblogged) if body_reblogged else "", + "body_reblogged": "\n".join(body_reblogged), + "content_order": ",".join(content_order), "author_reblogged": author_reblogged, - "tags": ",".join(post["tags"]) if post.get("tags") else "", + "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], "like_count": len(authors_liked), "authors_liked": ",".join(authors_liked), @@ -878,15 +909,15 @@ def map_item(post): "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), - "linked_urls": ",".join(linked_urls) if linked_urls else "", - "linked_urls_titles": "\n".join(linked_titles) if linked_titles else "", - "image_urls": ",".join(image_urls) if image_urls else "", - "video_urls": ",".join(video_urls) if video_urls else "", - "video_thumb_urls": ",".join(video_thumb_urls) if video_thumb_urls else "", - "audio_urls": ",".join(audio_urls) if audio_urls else "", - "audio_artist": ",".join(audio_artists) if audio_artists else "", + "linked_urls": ",".join(linked_urls), + "linked_urls_titles": "\n".join(linked_titles), + "image_urls": ",".join(image_urls), + "video_urls": ",".join(video_urls), + "video_thumb_urls": ",".join(video_thumb_urls), + "audio_urls": ",".join(audio_urls), + "audio_artist": ",".join(audio_artists), "poll_question": question, - "poll_answers": ",".join(answers) + "poll_answers": answers }) def after_process(self): @@ -900,8 +931,8 @@ def after_process(self): errors = [] if len(self.failed_notes) > 0: errors.append("API error(s) when fetching notes %s" % ", ".join(self.failed_notes)) - if len(self.failed_reblogs) > 0: - errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_reblogs)) + if len(self.failed_posts) > 0: + errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_posts)) if errors: self.dataset.log(";\n ".join(errors)) - self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details.") + self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details") From 8e885f013b2bf3e22cc29d61a7ded726ed5aebca Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:09:44 +0200 Subject: [PATCH 096/204] Fix Markdown, include audio and video, and follow correct block order in Tumblr Template --- datasources/tumblr/search_tumblr.py | 106 +++++++++++++++++++--------- 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index ee23e2f30..536981a1f 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -7,6 +7,7 @@ import time import pytumblr import requests +import re from requests.exceptions import ConnectionError from datetime import datetime from ural import urls_from_text @@ -765,22 +766,20 @@ def map_item(post): Tumblr posts can be many different types, so some data processing is necessary. :param posts, list: List of Tumblr posts as returned form the Tumblr API. - :param reblog, bool: Whether the post concerns a reblog of posts from the original dataset. 
:return dict: Mapped item """ media_types = ["photo", "video", "audio"] - # We're getting info as Neue Post Format types, - # so we need to loop through some 'blocks'. image_urls = [] video_urls = [] video_thumb_urls = [] audio_urls = [] audio_artists = [] - linked_urls = [] - linked_titles = [] + link_urls = [] + link_titles = [] + link_descriptions = [] question = "" answers = "" raw_text = [] @@ -801,57 +800,88 @@ def map_item(post): reblogged_text_blocks += layout_block["blocks"] author_reblogged = layout_block["attribution"]["blog"]["name"] - # Loop through "blocks" + # We're getting info as Neue Post Format types, + # so we need to loop through and join some content 'blocks'. for i, block in enumerate(post.get("content", [])): + block_type = block["type"] + # Image if block_type == "image": image_urls.append(block["media"][0]["url"]) + # Audio file elif block_type == "audio": - audio_urls.append(block["media"]["url"]) - audio_artists.append(block["artist"]) + audio_urls.append(block["url"] if "url" in block else block["media"]["url"]) + audio_artists.append(block.get("artist", "")) + # Video (embedded or hosted) elif block_type == "video": - video_urls.append(block["media"]["url"]) + if "media" in block: + video_urls.append(block["media"]["url"]) + elif "url" in block: + video_urls.append(block["url"]) if "filmstrip" in block: video_thumb_urls.append(block["filmstrip"]["url"]) elif "poster" in block: - video_thumb_urls.append(block["poster"][0]["url"]) + video_thumb_urls.append(block["poster"][0]["url"]) + else: + video_thumb_urls.append("") + # Embedded link elif block_type == "link": - linked_urls.append(block["url"]) + link_urls.append(block["url"]) if "title" in block: - linked_titles.append(block["title"]) + link_titles.append(block["title"]) if "description" in block: - raw_text += block["description"] + "\n" - formatted_text.append(block["description"]) + link_descriptions.append(block["description"]) + # Poll elif block_type == "poll": # Only one poll can be added per post question = block["question"] answers = ",".join([a["answer_text"] for a in block["answers"]]) - # We're gonna add some formatting to the text - # Skip text that is part of a reblogged post. + # Text + # Here we're adding Markdown formatting. + # We skip text that is part of a reblogged post. elif block_type == "text" and i not in reblogged_text_blocks: text = block["text"] - extra_chars = 0 if block.get("formatting"): + + # Dict with index numbers as keys where inserts need to be made, + # and the replacement strings as values. Done this way so we know + # when multiple formatting operations need to be made at the same + # index position. 
+ insert_indexes = set() + inserts = {} + for fmt in block["formatting"]: - fmt_type = fmt["type"] - s = fmt["start"] + extra_chars # Start of formatted substring - e = fmt["end"] + extra_chars # End of formatted substring - - if fmt_type == "link": - text = text[:s] + "[" + text[s:e] + "](" + fmt["formatting"]["url"] + ")" + text[e:] - extra_chars += 4 + len(fmt["formatting"]["url"]) - elif fmt_type == "italic": - text = text[:s] + "*" + text[s:e] + "*" + text[e:] - extra_chars += 2 - elif fmt_type == "bold": - text = text[:s] + "**" + text[s:e] + "**" + text[e:] - extra_chars += 4 - + if fmt["type"] in ("link", "bold", "italic"): + s = fmt["start"] + e = fmt["end"] + + opening = True # So we know if the styles need to be appended or prepended + for i in [s, e]: + insert_indexes.add(i) + i = str(i) + if i not in inserts: + inserts[i] = "" + if fmt_type == "link" and opening: + inserts[i] = inserts[i] + "[" + elif fmt_type == "link" and not opening: + inserts[i] = "](" + fmt["url"] + ")" + inserts[i] + elif fmt_type == "italic": + inserts[i] = "*" + inserts[i] if opening else inserts[i] + "*" + elif fmt_type == "bold": + inserts[i] = "**" + inserts[i] if opening else inserts[i] + "**" + opening = False + if inserts: + extra_chars = 0 + for i, insert in inserts.items(): + i = int(i) + extra_chars + text = text[:i] + insert + text[i:] + extra_chars += len(insert) + if block.get("subtype") == "unordered-list-item": text = "- " + text @@ -866,6 +896,15 @@ def map_item(post): content_order.append(block_type) + # Sometimes the order is reshuffled in the `layout` property... + if post.get("layout"): + if "type" in post["layout"][0]: + if post["layout"][0]["type"] == "rows": + new_content_order = [] + for i in post["layout"][0].get("display", []): + new_content_order.append(content_order[i["blocks"][0]]) + content_order = new_content_order + # Add note data for note in post.get("notes", []): if note["type"] == "like": @@ -909,8 +948,9 @@ def map_item(post): "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), - "linked_urls": ",".join(linked_urls), - "linked_urls_titles": "\n".join(linked_titles), + "link_urls": ",".join(link_urls), + "link_titles": "\n".join(link_titles), + "link_descriptions": "\n".join(link_descriptions), "image_urls": ",".join(image_urls), "video_urls": ",".join(video_urls), "video_thumb_urls": ",".join(video_thumb_urls), From c7fa5fa1a7cece8a778aa2661be0c26efea22e0d Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:07 +0200 Subject: [PATCH 097/204] Skip URLs in social mediafy template filter if it's already markdown --- webtool/lib/template_filters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index 3bc75ebb9..f04546d1a 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -202,13 +202,15 @@ def _jinja2_filter_social_mediafy(body, datasource=""): }, "tumblr": { "hashtag": "https://tumblr.com/tagged/", - "mention": "https://tumblr.com/" + "mention": "https://tumblr.com/", + "markdown": True }, "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", "mention": "https://linkedin.com/in/" }, "telegram": { + "markdown": True } } @@ -218,12 +220,12 @@ def _jinja2_filter_social_mediafy(body, datasource=""): return body # Add URL links - if datasource != "telegram": # Telegram has mardown links + if not base_urls[datasource].get("markdown"): for url in 
urls_from_text(body): body = re.sub(url, "%s" % (url, url), body) # Add hashtag links - if "hasthag" in base_urls[datasource]: + if "hashtag" in base_urls[datasource]: tags = re.findall(r"#[\w0-9]+", body) # We're sorting tags by length so we don't incorrectly # replace tags that are a substring of another, longer tag. From 281bf568e06d27a0898c25bdee383b9b241bd6f1 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:20 +0200 Subject: [PATCH 098/204] add markdown --- webtool/static/css/explorer/tumblr.css | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index cc33bc29d..e1299086b 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -72,6 +72,14 @@ header .author { filter: blur(1.5rem); } +.media-container.audio { + margin: initial; +} + +.media-container.audio audio { + width: 100%; +} + .play-button { position: absolute; width: 100%; @@ -130,6 +138,11 @@ header .author { color: white; } +.embedded-link .link-description { + margin-top: 3px; + font-size: 14px; +} + .poll-question { font-size: 20px; padding: 3px 0px 3px 0px; From b65ad4393eb49ee9536252145b6dd9e11070027c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:28 +0200 Subject: [PATCH 099/204] Typo in pagination --- webtool/templates/explorer/pagination.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html index 2161f22bd..1dbb4f05e 100644 --- a/webtool/templates/explorer/pagination.html +++ b/webtool/templates/explorer/pagination.html @@ -50,7 +50,7 @@ {# Show upper 'edge' pages #} {% elif upper_bound - 2 <= page <= upper_bound %} {% for i in range(page - 1, upper_bound) %} -
{% if page == current_page %}{{ page }}{% else %}">{{ i - 1 }}{% endif %}
+
{% if page == current_page %}{{ page }}{% else %}{{ i - 1 }}{% endif %}
  • {% endfor %} {% endif %} From e1d25da3f9ed9bb9f3314381c5a505eb8edc9292 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:10:47 +0200 Subject: [PATCH 100/204] Add video to Tumblr Template --- .../explorer/datasource-templates/tumblr.html | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 09f9d06a2..bfec84253 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -41,18 +41,18 @@ {% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %} {% for block in post.content_order.split(",") %} - {% if block == "text" %} -

    {{ post.get("body_markdown").split("\n")[block_counts.text] | social_mediafy(datasource='tumblr') | safe }}

    + +

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

    {% set block_counts.text = block_counts.text + 1 %} - {% elif block == "image" %} +
    {% set block_counts.image = block_counts.image + 1 %} - {% elif block == "video" %} + {% set block_counts.video = block_counts.video + 1 %} - {% elif block == "audio" %} +
    - -
    -
    +
    {% set block_counts.audio = block_counts.audio + 1 %} - {% elif block == "link" %} - {% set url = post.linked_urls.split(",")[block_counts.link] %} - {% set url_title = post.linked_urls_titles.split(",")[block_counts.link] %} - + + {% set url = post.link_urls.split(",")[block_counts.link] %} + {% set link_title = post.link_titles.split(",")[block_counts.link] %} + {% set link_description = post.link_descriptions.split(",")[block_counts.link] %} + {% set block_counts.link = block_counts.link + 1 %} - {% elif block == "poll" %} +
    {{ post["poll_question"] }}
    @@ -142,5 +144,4 @@ {% endif %}
    {% endif %} - \ No newline at end of file From 70e0c9bcecaec67b5268808690ab7502f85ec2c2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 17 Jul 2024 23:11:10 +0200 Subject: [PATCH 101/204] Be more honest with errors --- webtool/views/views_explorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 61fb467e6..18d657387 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -118,7 +118,7 @@ def explorer_dataset(key, page=1): break if not posts: - return error(404, error="No posts available for this datasource") + return error(404, error="No posts or posts could not be displayed") # We can use either a generic or a pre-made data source-specific template. template = "datasource" if has_datasource_template(datasource) else "generic" From 95d03f49c05d2f865128aa98b7ec57f2c14f5fb9 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 23 Jul 2024 17:42:29 +0200 Subject: [PATCH 102/204] Add more layout options for Tumblr --- datasources/tumblr/search_tumblr.py | 30 +++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 536981a1f..3ab0cd171 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -2,6 +2,10 @@ Search Tumblr via its API Can fetch posts from specific blogs or with specific hashtags + +For Tumblr API documentation, see https://www.tumblr.com/docs/en/api/v2 +For Neue Post Format documentation, see https://github.com/tumblr/docs/blob/master/npf-spec.md + """ import time @@ -790,6 +794,9 @@ def map_item(post): authors_replied = [] replies = [] + # Keep track of list order + list_order = 1 + # Keep track if blocks belong to another post, # which is stored in `layout`. body_reblogged = [] @@ -881,9 +888,23 @@ def map_item(post): i = int(i) + extra_chars text = text[:i] + insert + text[i:] extra_chars += len(insert) - - if block.get("subtype") == "unordered-list-item": - text = "- " + text + + # Some more 'subtype' formatting + subtype = block.get("subtype") + if subtype: + if subtype == "unordered-list-item": + text = "- " + text + if subtype == "ordered-list-item": + text = list_order + ". " + text + list_order += 1 + elif subtype == "heading1": + text = "#" + + elif subtype == "heading2": + text = "##" + text + elif subtype == "quote": + text = ">" + text + elif subtype == "indented": + text = " " + text raw_text.append(block["text"]) formatted_text.append(text) @@ -896,7 +917,8 @@ def map_item(post): content_order.append(block_type) - # Sometimes the order is reshuffled in the `layout` property... + # Sometimes the order is reshuffled in the `layout` property, + # so we have to correct this. 
if post.get("layout"): if "type" in post["layout"][0]: if post["layout"][0]["type"] == "rows": From 451f2bb5c1497c3d1f18c53778bc0a5373b7dada Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 24 Jul 2024 12:49:20 +0200 Subject: [PATCH 103/204] No post reshuffling after the fact --- datasources/tumblr/search_tumblr.py | 42 ++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 3ab0cd171..2933279df 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -775,7 +775,6 @@ def map_item(post): """ media_types = ["photo", "video", "audio"] - image_urls = [] video_urls = [] video_thumb_urls = [] @@ -788,15 +787,11 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - content_order = [] # To retain the order in which post blocks appear authors_liked = [] authors_reblogged = [] authors_replied = [] replies = [] - # Keep track of list order - list_order = 1 - # Keep track if blocks belong to another post, # which is stored in `layout`. body_reblogged = [] @@ -807,10 +802,25 @@ def map_item(post): reblogged_text_blocks += layout_block["blocks"] author_reblogged = layout_block["attribution"]["blog"]["name"] + ordered_list_count = 1 + + # Sometimes the content order is reshuffled in the `layout` property, + # so we have to follow this. + content_order = [] + blocks = [] + if post.get("layout"): + if "type" in post["layout"][0]: + if post["layout"][0]["type"] == "rows": + for display in post["layout"][0].get("display", []): + content_order.append(display["blocks"][0]) + if not content_order: + content_order = range(len(post["content"])) + # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. - for i, block in enumerate(post.get("content", [])): + for i in content_order: + block = post["content"][i] block_type = block["type"] # Image @@ -895,10 +905,10 @@ def map_item(post): if subtype == "unordered-list-item": text = "- " + text if subtype == "ordered-list-item": - text = list_order + ". " + text - list_order += 1 + text = str(ordered_list_count) + ". " + text + ordered_list_count += 1 elif subtype == "heading1": - text = "#" + + text = "#" + text elif subtype == "heading2": text = "##" + text elif subtype == "quote": @@ -915,17 +925,7 @@ def map_item(post): # as it is always put first. continue - content_order.append(block_type) - - # Sometimes the order is reshuffled in the `layout` property, - # so we have to correct this. 
- if post.get("layout"): - if "type" in post["layout"][0]: - if post["layout"][0]["type"] == "rows": - new_content_order = [] - for i in post["layout"][0].get("display", []): - new_content_order.append(content_order[i["blocks"][0]]) - content_order = new_content_order + blocks.append(block_type) # Add note data for note in post.get("notes", []): @@ -959,7 +959,7 @@ def map_item(post): "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n".join(body_reblogged), - "content_order": ",".join(content_order), + "content_order": ",".join(blocks), "author_reblogged": author_reblogged, "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], From 996512d15c5d195c75999a748408b31b0eff5ec5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 24 Jul 2024 14:06:26 +0200 Subject: [PATCH 104/204] Skip duplicate posts in a better way --- datasources/tumblr/search_tumblr.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 2933279df..7604f7d46 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -345,22 +345,23 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): retries += 1 continue - # Get rid of posts that we already enountered, + # Skip posts that we already enountered, # preventing Tumblr API shenanigans or double posts because of - # time reductions. Make sure it's no odd error string, though. - unseen_posts = [] - for check_post in posts: + # time reductions. Make sure it's no error string, though. + new_posts = [] + for post in posts: # Sometimes the API repsonds just with "meta", "response", or "errors". - if isinstance(check_post, str): - self.dataset.update_status("Couldn't add post:", check_post) + if isinstance(post, str): + self.dataset.update_status("Couldn't add post:", post) retries += 1 break else: retries = 0 - if check_post["id"] not in self.seen_ids: - unseen_posts.append(check_post) + if post["id"] not in self.seen_ids: + self.seen_ids.add(post["id"]) + new_posts.append(post) - posts = unseen_posts + posts = new_posts # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested. # So we have to prevent this manually. @@ -431,8 +432,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S") self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,)) - - self.seen_ids.update([post["id"] for post in posts]) posts = [post for post in posts if post["timestamp"] >= date] if posts: all_posts += posts @@ -456,7 +455,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): if posts: all_posts += posts - self.seen_ids.update([post["id"] for post in posts]) break # We got a new post, so we can reset the retry counts. 
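The pattern these hunks converge on is worth spelling out: record a post's ID the moment it is kept, so that later pages of the paginated API can never re-add it. A minimal sketch of that idea, outside the patch series, with a hypothetical `pages` iterable standing in for the successive API responses:

    def deduplicate(pages):
        """Keep only the first occurrence of each post across API pages."""
        seen_ids = set()
        all_posts = []
        for posts in pages:
            for post in posts:
                # The API sometimes returns bare strings ("meta", "errors")
                # instead of post dicts; the real code retries, this sketch skips.
                if isinstance(post, str):
                    continue
                if post["id"] not in seen_ids:
                    seen_ids.add(post["id"])   # record immediately, so a later
                    all_posts.append(post)     # page cannot re-add the same post
        return all_posts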
@@ -466,9 +464,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 				# Add retrieved posts to the main list
 				all_posts += posts

-				# Add to seen ids
-				self.seen_ids.update([post["id"] for post in posts])
-
 				# Add time differences and calculate new average time difference
 				all_time_difs += time_difs

From 05e5c7bb78b87e140cd3133e7de63efcad972a20 Mon Sep 17 00:00:00 2001
From: Sal Hagen <salhagen@protonmail.com>
Date: Wed, 24 Jul 2024 14:06:40 +0200
Subject: [PATCH 105/204] Don't hashtagify

---
 webtool/lib/template_filters.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index f04546d1a..f7bfc23a2 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -201,16 +201,15 @@ def _jinja2_filter_social_mediafy(body, datasource=""):
 			"mention": "https://instagram.com/"
 		},
 		"tumblr": {
-			"hashtag": "https://tumblr.com/tagged/",
 			"mention": "https://tumblr.com/",
-			"markdown": True
+			"markdown": True # Hashtags aren't linked in the post body
 		},
 		"linkedin": {
 			"hashtag": "https://linkedin.com/feed/hashtag/?keywords=",
 			"mention": "https://linkedin.com/in/"
 		},
 		"telegram": {
-			"markdown": True
+			"markdown": True
 		}
 	}
@@ -232,7 +231,6 @@ def _jinja2_filter_social_mediafy(body, datasource=""):
 		tags = sorted(tags, key=lambda x: len(x), reverse=True)
 		for tag in tags:
 			# Match the string, but not if it's preceded by a >, which indicates that we've already added an anchor tag.
-			# This avoids problems with repeated substrings (e.g. #Dog and #DogOwners).
 			body = re.sub(r"(?)(" + tag + ")", "%s" % (base_urls[datasource]["hashtag"] + tag[1:], tag), body)

 		# Add @-mention links

From 8263ebc31014fd66c2923b9960c8c184a045aaa3 Mon Sep 17 00:00:00 2001
From: Sal Hagen <salhagen@protonmail.com>
Date: Wed, 24 Jul 2024 16:40:05 +0200
Subject: [PATCH 106/204] Skip duplicate Tumblr posts and format Ask content better

---
 datasources/tumblr/search_tumblr.py           | 127 ++++++++++--------
 .../explorer/datasource-templates/tumblr.html |  45 ++++---
 2 files changed, 99 insertions(+), 73 deletions(-)

diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py
index 7604f7d46..bae3b8878 100644
--- a/datasources/tumblr/search_tumblr.py
+++ b/datasources/tumblr/search_tumblr.py
@@ -533,27 +533,43 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None):
 					self.dataset.update_status("No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
 					continue

-				# Append posts to main list
-				else:
+				# Skip posts that we already encountered,
+				# preventing Tumblr API shenanigans or double posts because of
+				# time reductions. Make sure it's no error string, though.
+				new_posts = []
+				for post in posts:
+					# Sometimes the API responds just with "meta", "response", or "errors".
+					if isinstance(post, str):
+						self.dataset.update_status("Couldn't add post:", post)
+						retries += 1
+						break
+					else:
+						retries = 0
+						if post["id"] not in self.seen_ids:
+							self.seen_ids.add(post["id"])
+							new_posts.append(post)

-					# Get the lowest date
-					max_date = sorted([post["timestamp"] for post in posts])[0]
+				posts = new_posts

-					# Manually check if we have a lower date than the min date (`min_date`) already.
-					# This functonality is not natively supported by Tumblr.
-					if min_date:
-						if max_date < min_date:
+				# Append posts to main list
+				# Get the lowest date
+				max_date = sorted([post["timestamp"] for post in posts])[0]

-							# Get rid of all the posts that are earlier than the max_date timestamp
-							posts = [post for post in posts if post["timestamp"] >= min_date]
+				# Manually check if we have a lower date than the min date (`min_date`) already.
+				# This functionality is not natively supported by Tumblr.
+				if min_date:
+					if max_date < min_date:

-							if posts:
-								all_posts += posts
-							break
+						# Get rid of all the posts that are earlier than the max_date timestamp
+						posts = [post for post in posts if post["timestamp"] >= min_date]

-					retries = 0
+						if posts:
+							all_posts += posts
+						break

-					all_posts += posts
+				retries = 0
+
+				all_posts += posts

 				if len(all_posts) >= self.max_posts:
 					self.max_posts_reached = True
 					break

@@ -782,20 +798,12 @@ def map_item(post):
 		answers = ""
 		raw_text = []
 		formatted_text = []
+		body_asked = []
+		author_asked = ""
 		authors_liked = []
-		authors_reblogged = []
 		authors_replied = []
 		replies = []
-
-		# Keep track if blocks belong to another post,
-		# which is stored in `layout`.
-		body_reblogged = []
-		reblogged_text_blocks = []
-		author_reblogged = ""
-		for layout_block in post.get("layout", []):
-			if layout_block["type"] == "ask":
-				reblogged_text_blocks += layout_block["blocks"]
-				author_reblogged = layout_block["attribution"]["blog"]["name"]
+		unknown_blocks = []

 		ordered_list_count = 1

@@ -811,6 +819,13 @@ def map_item(post):
 		if not content_order:
 			content_order = range(len(post["content"]))

+		# Some text blocks are 'ask' blocks
+		ask_blocks = []
+		for layout_block in post.get("layout", []):
+			if layout_block["type"] == "ask":
+				ask_blocks += layout_block["blocks"]
+				author_asked = layout_block["attribution"]["blog"]["name"]
+
 		# We're getting info as Neue Post Format types,
 		# so we need to loop through and join some content 'blocks'.
 		for i in content_order:
@@ -850,10 +865,8 @@ def map_item(post):
 				question = block["question"]
 				answers = ",".join([a["answer_text"] for a in block["answers"]])

-			# Text
-			# Here we're adding Markdown formatting.
-			# We skip text that is part of a reblogged post.
-			elif block_type == "text" and i not in reblogged_text_blocks:
+			# Text; we're adding Markdown formatting.
+ elif block_type == "text": text = block["text"] @@ -872,26 +885,26 @@ def map_item(post): s = fmt["start"] e = fmt["end"] - opening = True # So we know if the styles need to be appended or prepended - for i in [s, e]: - insert_indexes.add(i) - i = str(i) - if i not in inserts: - inserts[i] = "" + opening = True # To know if styles need to be appended or prepended + for n in [s, e]: + insert_indexes.add(n) + n = str(n) + if n not in inserts: + inserts[n] = "" if fmt_type == "link" and opening: - inserts[i] = inserts[i] + "[" + inserts[n] = inserts[n] + "[" elif fmt_type == "link" and not opening: - inserts[i] = "](" + fmt["url"] + ")" + inserts[i] + inserts[n] = "](" + fmt["url"] + ")" + inserts[n] elif fmt_type == "italic": - inserts[i] = "*" + inserts[i] if opening else inserts[i] + "*" + inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" elif fmt_type == "bold": - inserts[i] = "**" + inserts[i] if opening else inserts[i] + "**" + inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" opening = False if inserts: extra_chars = 0 - for i, insert in inserts.items(): - i = int(i) + extra_chars - text = text[:i] + insert + text[i:] + for n, insert in inserts.items(): + n = int(n) + extra_chars + text = text[:n] + insert + text[n:] extra_chars += len(insert) # Some more 'subtype' formatting @@ -911,14 +924,18 @@ def map_item(post): elif subtype == "indented": text = " " + text - raw_text.append(block["text"]) - formatted_text.append(text) + # If it's an ask text, we're storing it in + # a different column + if i in ask_blocks: + block_type = "ask" + body_asked.append(block["text"]) + else: + raw_text.append(block["text"]) + formatted_text.append(text) - elif block_type == "text" and i in reblogged_text_blocks: - body_reblogged.append(block["text"]) - # Reblogged text is not considered as an ordered post block, - # as it is always put first. - continue + # Unknown block; can be a third-party app + else: + unknown_blocks.append(json.dumps(block)) blocks.append(block_type) @@ -953,15 +970,14 @@ def map_item(post): "is_reblog": True if post.get("original_type") == "note" else "", "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_reblogged": "\n".join(body_reblogged), + "body_asked": "\n".join(body_asked), + "author_asked": author_asked, "content_order": ",".join(blocks), - "author_reblogged": author_reblogged, "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], "like_count": len(authors_liked), "authors_liked": ",".join(authors_liked), - "reblog_count": len(authors_reblogged), - "authors_reblogged": ",".join(authors_reblogged), + #"reblog_count": len(authors_reblogged), "reply_count": len(authors_replied), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), @@ -974,7 +990,8 @@ def map_item(post): "audio_urls": ",".join(audio_urls), "audio_artist": ",".join(audio_artists), "poll_question": question, - "poll_answers": answers + "poll_answers": answers, + "unknown_blocks": "\n".join(unknown_blocks) }) def after_process(self): diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index bfec84253..67ff9b161 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -22,25 +22,12 @@
    - - {% if post.get("body_reblogged") %} -
    -
    -
    {% if not pseudonymised %}{{ post["author_reblogged"] }}{% else %}{% endif %}
    -
    {{ post["body_reblogged"] }}
    -
    -
    -
    - {% if not pseudonymised %} - - {% endif %} -
-	{% endif %}
-	{% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0}) %}
-	{% for block in post.content_order.split(",") %}
+	{% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0, 'ask': 0}) %}
+	{% set content_order = post.content_order.split(",") %}
+	{% for block in content_order %}
 	{% if block == "text" %}

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

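The per-type counters in `block_counts` are what keep this lookup aligned: `content_order` interleaves all block types, while `body_markdown` only holds the text blocks, one per line, so the n-th "text" entry in the order maps to line n of `body_markdown`. The same indexing in plain Python, with made-up values:

    content_order = "text,image,text".split(",")
    body_markdown = "first paragraph\nsecond paragraph".split("\n")

    block_counts = {"text": 0, "image": 0}
    for block in content_order:
        if block == "text":
            # The per-type counter, not the loop index, selects the right line
            print(body_markdown[block_counts["text"]])
        block_counts[block] += 1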
    @@ -71,10 +58,12 @@ {% set url = post.link_urls.split(",")[block_counts.link] %} {% set link_title = post.link_titles.split(",")[block_counts.link] %} {% set link_description = post.link_descriptions.split(",")[block_counts.link] %} - + + {% elif block == "ask" %} + {% set start_ask_block = True if loop.index == 0 or content_order[loop.index - 2] != "ask" else False %} + {% set end_ask_block = True if loop.index == content_order|length or content_order[loop.index] != "ask" else False %} + {% if start_ask_block %} +
    +
    +
    {% if not pseudonymised %}{{ post["author_asked"] }}{% else %}{% endif %} asked:
    + {% endif %} +

    {{ post.get("body_asked").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    + {% if end_ask_block %} +
    +
    +
    + {% if not pseudonymised %} + + {% endif %} +
    + {% endif %} + {% set block_counts.ask = block_counts.ask + 1 %} {% endif %} {% endfor %} From df7185a6ef85dbaf1b4c95d1d8db9d5824a168eb Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 26 Jul 2024 11:48:45 +0200 Subject: [PATCH 107/204] More Tumblr Explorer templating --- webtool/static/css/explorer/tumblr.css | 51 ++++++++++++++----- .../explorer/datasource-templates/tumblr.html | 20 ++++++-- 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index e1299086b..baeab6eac 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -68,7 +68,7 @@ header .author { .media-container.video img { min-height: 300px; - width: auto; + width: 100%; filter: blur(1.5rem); } @@ -108,37 +108,62 @@ header .author { padding: 3px 0px 3px 0px; } -.reblogged-content { + +.post-content h1 { + font-size: 20px; + font-weight: bold; + background: none; + text-align: left; + color: black; +} + +.post-content h2 { + font-size: 16px; + font-weight: bold; +} + +.ask-content { margin-bottom: 19px; display: inline-block; - max-width: 400px; + max-width: 450px; +} + +.ask-content .body-ask { padding: 25px; background-color: #ededed; } -.author-reblogged { +.ask-content p { + margin: 5px 0px 5px 0px; +} + +.ask-content { +} + +.author-ask { padding-bottom: 3px; } -.author-reblogged-avatar { +.author-ask-avatar { display: inline-block; + vertical-align: top; +} + +a.embedded-link:hover { + text-decoration: none; } -.embedded-link { +.embedded-link-box { padding: 30px; background-color: #001935; color: white; text-align: center; font-size: 18px; - border-radius: 15px; - margin-bottom: 19px; -} - -.embedded-link a { - color: white; + border-radius: 5px; + margin: 19px 0px 19px 0px; } -.embedded-link .link-description { +.embedded-link-box .link-description { margin-top: 3px; font-size: 14px; } diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 67ff9b161..3866f3c70 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -21,6 +21,20 @@ + +{% if post.author_trail %} + +{% for reblog_author in post.author_trail.split(",") %} +
    + {{ reblog_author }} +

    + {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} +

    +
+{% endfor %}
+
+{% endif %}
+
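This loop only works because the mapping code keeps the two columns parallel: `author_trail` is comma-separated, `body_reblogged` separates entries with blank lines, and `loop.index - 1` pairs them positionally. The same pairing in plain Python, with made-up values:

    author_trail = "blog-a,blog-b"
    body_reblogged = "text added by blog-a\n\ntext added by blog-b"

    for author, body in zip(author_trail.split(","), body_reblogged.split("\n\n")):
        print("%s: %s" % (author, body))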
    @@ -83,15 +97,15 @@ {% if start_ask_block %}
    -
    {% if not pseudonymised %}{{ post["author_asked"] }}{% else %}{% endif %} asked:
    +
    {% if not pseudonymised %}{{ post["author_ask"] }}{% else %}{% endif %} asked:
    {% endif %} -

    {{ post.get("body_asked").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    +

    {{ post.get("body_ask").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

    {% if end_ask_block %}
    {% if not pseudonymised %} - + {% endif %}
    {% endif %} From f6858badf28e3a661e589c3e229c594085efe03a Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Fri, 26 Jul 2024 11:48:56 +0200 Subject: [PATCH 108/204] Revamp Tumblr search v0.5 --- datasources/tumblr/search_tumblr.py | 541 ++++++++++++++++++---------- 1 file changed, 346 insertions(+), 195 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index bae3b8878..f2ad00dc5 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -94,32 +94,75 @@ def get_options(cls, parent_dataset=None, user=None): "intro": { "type": UserInput.OPTION_INFO, "help": "Retrieve any kind of Tumblr posts with specific tags or from specific blogs. Gets 100.000 posts " - "at max. Insert tags or names of blogs, one on each line. You may insert up to ten tags or " - "blogs.\n\nTumblr tags may include whitespace and commas. A `#` before the tag is optional.\n\n" - "Tag search only get posts explicitly associated with the exact tag you insert here. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is not " - "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf this 4CAT reached its Tumblr API rate " - "limit, try again 24 hours later." - }, - "search_scope": { - "type": UserInput.OPTION_CHOICE, - "help": "Search by", - "options": { - "tag": "Tag", - "blog": "Blog" - }, - "default": "tag" + "at max. You may insert up to ten tags or blogs.\n\n" + "Blog-level search also returns reblogs. *Tag-level search only returns original posts*. " + "Reblogs of tagged posts can be retrieved via the options below.\n\n" + "Tag search only get posts with the exact tag you insert. Querying " + "`gogh` will thus not get posts only tagged with `van gogh`.\n\n" + "A `#` before a tag is optional. Blog names must start with `@`.\n\n" + "Individual posts can be retrieved through the format `@blogname:post_id`.\n\n" + "Keyword search is not allowed by the [Tumblr API](https://api.tumblr.com).\n\n" + "If this 4CAT reached its Tumblr API rate limit, try again 24 hours later." }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags/blogs", - "tooltip": "Separate with commas or new lines." + "help": "Tags and/or blogs", + "tooltip": "E.g. #research tools, #digitalmethods, @the4catblog, @the4catblog:12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Get post notes (warning: slow)", - "tooltip": "Also retrieve post notes. Likes and replies are added to the original post. Text reblogs are added as new posts.", + "help": "Add note and reblog data (warning: slow)", + "tooltip": "Add post note data for every post. This includes note metrics (likes, replies, reblogs), " + "replies, and reblogged text. " + "Blog-level search includes reblogged text by default." + "This also allows adding reblogs as new posts", "default": False + }, + "get_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add reblogs of collected posts", + "tooltip": "Also include posts that reblogged the posts captured in the initial query. " + "Limited to 1,000 reblogs per post.", + "default": False, + "requires": "get_notes" + }, + "reblog_crawl_depth": { + "type": UserInput.OPTION_TEXT, + "help": "Reblog crawl depth", + "tooltip": "How many levels of reblogs to follow; e.g. 
a value of 2 adds every reblog " + "of the initial post, but also reblogs of these reblogs.", + "default": "1", + "requires": "get_reblogs", + "requires": "get_notes" + }, + "follow_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add posts reblogged by collected posts", + "tooltip": "Also include posts that were reblogged by the posts captured in the initial query. " + "This adds the entire reblog 'trail' from the initial post to the collected post. " + "Only affects blog-level search; tag search only gets original posts.", + "default": False, + "requires": "get_notes" + }, + "reblog_type": { + "type": UserInput.OPTION_CHOICE, + "help": "Reblogs to include", + "options": { + "all": "All (also 'empty' reblogs)", + "hashtag_or_text": "Only with added hashtags and/or added text", + "hashtag": "Only with added hashtags", + "text": "Only with added text" + }, + "tooltip": "What type of reblogs to add to the dataset.", + "default": "hashtag_or_text", + "requires": "get_notes" + }, + "reblog_outside_daterange": { + "type": UserInput.OPTION_TOGGLE, + "help": "Allow reblogs and reblogged posts outside of date range", + "default": False, + "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the date range limits inserted below.", + "requires": "get_notes" } } @@ -129,10 +172,10 @@ def get_options(cls, parent_dataset=None, user=None): # No 4CAT set keys for user; let user input their own options["key-info"] = { "type": UserInput.OPTION_INFO, - "help": "In order to access the Tumblr API, you need to register an application. You can do so " - "[here](https://www.tumblr.com/oauth/apps) and use the keys below. You will first get the OAuth " + "help": "To access the Tumblr API, you need to register an application. You can do so " + "[here](https://www.tumblr.com/oauth/apps). You will first get the OAuth " "Consumer Key and Secret, and then the User Token Key and Secret [after entering them here](ht" - "tps://api.tumblr.com/console/calls/user/info) and granting access." + "tps://api.tumblr.com/console/calls/user/info) and granting access." } options["consumer_key"] = { "type": UserInput.OPTION_TEXT, @@ -187,15 +230,20 @@ def get_items(self, query): # ready our parameters parameters = self.dataset.get_parameters() - scope = parameters.get("search_scope", "") - queries = parameters.get("query").split(", ") + queries = re.split(",|\n", parameters.get("query", "")) get_notes = parameters.get("get_notes", False) + get_reblogs = parameters.get("get_reblogs", False) + reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) + follow_reblogs = parameters.get("follow_reblogs", False) + reblog_type = parameters.get("reblog_type", "hashtag_or_text") + reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) # Store all info here results = [] - # Store all notes from posts by blogs here - all_notes = [] + # Blog names and post IDs of extra posts we need to fetch + # (e.g. 
in the reblog trail or posts that reblog captured posts) + extra_posts = set() # Get date parameters min_date = parameters.get("min_date", None) @@ -224,8 +272,23 @@ def get_items(self, query): # for each tag or blog, get post for query in queries: - # Get posts per tag - if scope == "tag": + query = query.strip() + + if query.startswith("@"): + blog_name = query[1:] + + # Get a possible post ID + post_id = None + if ":" in query: + blog_name, post_id = blog_name.split(":") + + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Get tagged post + else: + if query.startswith("#"): + query = query[1:] + # Used for getting tagged posts, which uses requests instead. api_key = self.parameters.get("consumer_key") if not api_key: @@ -233,14 +296,6 @@ def get_items(self, query): new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) - # Get posts per blog - elif scope == "blog": - new_results = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - - else: - self.dataset.update_status("Invalid scope") - break - results += new_results if self.max_posts_reached: @@ -250,36 +305,88 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # Loop through the results once to add note data and fetch text reblogs, - len_results = len(results) # results will change in length when we add reblogs. - for i in range(len_results): + # Get note data. + # Also potentially store reblogs that we want to add to the dataset + if get_notes: + + # Dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain, + # so we can use this to check whether we already have the data. + retrieved_notes = {} + + for i, post in enumerate(results): - post = results[i] + if self.max_posts_reached: + break + if self.api_limit_reached: + break - # Get note information - if get_notes and not self.max_posts_reached and not self.api_limit_reached: + self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) - # Reblog information is already returned for blog-level searches - # and is stored as `notes` in the posts themselves. - # Retrieving notes for tag-based posts must be done one-by-one; - # fetching them all at once is not supported by the Tumblr API. - if not "notes" in post: - self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) + # We may have already encountered this note-chain + # with a different post. + if post["reblog_key"] in retrieved_notes: + notes = retrieved_notes[post["reblog_key"]] + # In the case of posts with just a few notes in blog-level search, + # we may have all the possible notes in the retrieved JSON. + elif len(post["notes"]) == post["note_count"]: + notes = post["notes"] + + # Do some conversion that is also done in get_notes + for note in notes: + + + else: + # We're getting notes in the "conversation" mode to + # prioritise replies and text reblogs. + # Only gets first 1,000 replies/text reblogs. 
notes = self.get_notes(post["blog_name"], post["id"]) - time.sleep(.2) + time.sleep(.1) + + final_notes = {"notes": notes, + "like_count": notes["like_count"], + "reply_count": notes["reply_count"], + "reblog_count": notes["reblog_count"], + } + + # Add to results + results[i] |= final_notes + retrieved_notes[post["reblog_key"]] = final_notes + + # Get the full data for text reblogs and add them as new posts + if get_reblogs: - if notes: - results[i]["notes"] = notes + for note in final_notes["notes"]: + + if reblog_type == "hashtag_or_text": + + elif reblog_type == "hashtag_or_text": + + elif reblog_type == "text": + + elif reblog_type == "all": + pass + + # Potentially skip new posts outside of the date range + if reblog_outside_daterange and (max_date or min_date): + if not min_date: + if note["timestamp"] >= max_date: + continue + elif not min_date <= note["timestamp"] <= max_date: + continue + + extra_posts.add({"blog": note["blog_name"], "post_id": note["post_id"]}) + + # Check for reblogged posts in the reblog trail + if follow_reblogs: + for result in results: + if result["trail"] + + # Add reblogged posts and reblogs to dataset + for extra_post in extra_posts: + print("add") - # Get the full data for text reblogs and add them as new posts - for note in notes: - if note["type"] == "reblog": - text_reblog = self.get_post_by_id(note["blog_name"], note["post_id"]) - if text_reblog: - results.append(text_reblog) - time.sleep(.2) - self.job.finish() return results @@ -485,17 +592,25 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): return all_posts - def get_posts_by_blog(self, blog, max_date=None, min_date=None): + def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): """ - Get Tumblr posts posts with a certain blog - :param tag, str: the name of the blog you want to look for - :param min_date: a unix timestamp, indicates posts should be min_date this date. + Get Tumblr posts from a certain blog + :param blog, str: the name of the blog you want to look for + :param post_id, str: the post ID (optional) :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param min_date: a unix timestamp, indicates posts should be min_date this date. :returns: a dict created from the JSON response """ + blog = blog + ".tumblr.com" + if post_id: + try: + test_id = int(post_id) + except TypeError: + raise QueryParametersException("Post ID %s is invalid" % post_id) + if not max_date: max_date = int(time.time()) @@ -518,9 +633,19 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) + posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, notes_info=True, filter="raw", npf=True) posts = posts["posts"] + except ConnectionRefusedError: + retries += 1 + if post_id: + self.failed_posts.append(post_id) + self.dataset.update_status("ConnectionRefused: Unable to collect post %s/%s" % (blog, post_id)) + else: + self.dataset.update_status("ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date)) + time.sleep(10) + continue + except Exception as e: self.dataset.update_status("Reached the limit of the Tumblr API. 
Last timestamp: %s" % str(max_date)) self.api_limit_reached = True @@ -551,21 +676,23 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): posts = new_posts - # Append posts to main list - # Get the lowest date - max_date = sorted([post["timestamp"] for post in posts])[0] + if not post_id: - # Manually check if we have a lower date than the min date (`min_date`) already. - # This functonality is not natively supported by Tumblr. - if min_date: - if max_date < min_date: + # Append posts to main list + # Get the lowest date + max_date = sorted([post["timestamp"] for post in posts])[0] + + # Manually check if we have a lower date than the min date (`min_date`) already. + # This functonality is not natively supported by Tumblr. + if min_date: + if max_date < min_date: - # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date] + # Get rid of all the posts that are earlier than the max_date timestamp + posts = [post for post in posts if post["timestamp"] >= min_date] - if posts: - all_posts += posts - break + if posts: + all_posts += posts + break retries = 0 @@ -574,54 +701,14 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): if len(all_posts) >= self.max_posts: self.max_posts_reached = True break + if post_id: + break - self.dataset.update_status("Collected %s posts for blog %s" % str(len(all_posts), blog)) + self.dataset.update_status("Collected %s posts for blog %s" % (str(len(all_posts)), blog)) time.sleep(.2) return all_posts - def get_post_by_id(self, blog_name, post_id): - """ - Fetch individual posts - :param blog_name, str: The blog's name - :param id, int: The post ID - - returns result list, a list with a dictionary with the post's information - """ - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") - - connection_retries = 0 - - while True: - if connection_retries >= 5: - self.dataset.update_status("Too many connection errors; unable to collect post %s" % post_id) - break - try: - # Request the specific post. - post = self.client.posts(blog_name, id=post_id, npf=True, reblog_info=True, notes_info=True, filter="raw") - - except ConnectionRefusedError: - connection_retries += 1 - self.failed_posts.append(note["id"]) - self.dataset.update_status("ConnectionRefused: Unable to collect reblogs for post %s" % post_id) - time.sleep(10) - continue - - if post: - break - time.sleep(.2) - - # Tumblr API can sometimes return with this kind of error: - # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if not post or "posts" not in post: - return None - - # Get the first element of the list - it's always one post. - result = post["posts"][0] - - return result - def get_notes(self, blog_id, post_id): """ Gets data on the notes of a specific post. @@ -631,6 +718,7 @@ def get_notes(self, blog_id, post_id): :returns: a list with dictionaries of notes. """ + note_metrics = {} post_notes = [] max_date = None @@ -644,6 +732,8 @@ def get_notes(self, blog_id, post_id): max_notes_retries = 10 notes_retries = 0 + first_batch = True + count += 1 if self.interrupted: @@ -658,8 +748,12 @@ def get_notes(self, blog_id, post_id): # Requests a post's notes try: - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date) - print(notes) + + # Imprtant: we're getting notes in 'conversation' mode to + # prioritise replies and reblogs that add text. 
+ # We're not interested in the the names of authors that liked the post + # or who reblogged without adding content. + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode="conversation") except ConnectionRefusedError: self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 @@ -675,6 +769,15 @@ def get_notes(self, blog_id, post_id): if "notes" in notes: notes_retries = 0 + # Add some metrics for the first response + if first_batch: + note_metrics = { + "reply_count": notes["total_replies"], + "reblog_count": notes["total_reblogs"], + "like_count": notes["total_likes"] + } + first_batch = False + for note in notes["notes"]: post_notes.append(note) @@ -693,6 +796,8 @@ def get_notes(self, blog_id, post_id): time.sleep(1) continue + post_notes = {"notes": post_notes} | note_metrics + return post_notes @staticmethod @@ -798,10 +903,14 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - body_asked = [] - author_asked = "" - authors_liked = [] + authors_reblogged = [] + reblog_trail = [] + body_reblogged = [] + author_trail = [] + body_ask = [] + author_ask = "" authors_replied = [] + like_count = "" replies = [] unknown_blocks = [] @@ -824,7 +933,7 @@ def map_item(post): for layout_block in post.get("layout", []): if layout_block["type"] == "ask": ask_blocks += layout_block["blocks"] - author_asked = layout_block["attribution"]["blog"]["name"] + author_ask = layout_block["attribution"]["blog"]["name"] # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. @@ -868,70 +977,16 @@ def map_item(post): # Text; we're adding Markdown formatting. elif block_type == "text": - text = block["text"] - - if block.get("formatting"): - - # Dict with index numbers as keys where inserts need to be made, - # and the replacement strings as values. Done this way so we know - # when multiple formatting operations need to be made at the same - # index position. - insert_indexes = set() - inserts = {} - - for fmt in block["formatting"]: - fmt_type = fmt["type"] - if fmt["type"] in ("link", "bold", "italic"): - s = fmt["start"] - e = fmt["end"] - - opening = True # To know if styles need to be appended or prepended - for n in [s, e]: - insert_indexes.add(n) - n = str(n) - if n not in inserts: - inserts[n] = "" - if fmt_type == "link" and opening: - inserts[n] = inserts[n] + "[" - elif fmt_type == "link" and not opening: - inserts[n] = "](" + fmt["url"] + ")" + inserts[n] - elif fmt_type == "italic": - inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" - elif fmt_type == "bold": - inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" - opening = False - if inserts: - extra_chars = 0 - for n, insert in inserts.items(): - n = int(n) + extra_chars - text = text[:n] + insert + text[n:] - extra_chars += len(insert) - - # Some more 'subtype' formatting - subtype = block.get("subtype") - if subtype: - if subtype == "unordered-list-item": - text = "- " + text - if subtype == "ordered-list-item": - text = str(ordered_list_count) + ". 
" + text - ordered_list_count += 1 - elif subtype == "heading1": - text = "#" + text - elif subtype == "heading2": - text = "##" + text - elif subtype == "quote": - text = ">" + text - elif subtype == "indented": - text = " " + text + md_text = SearchTumblr.format_tumblr_text(block) # If it's an ask text, we're storing it in # a different column if i in ask_blocks: block_type = "ask" - body_asked.append(block["text"]) + body_ask.append(block["text"]) else: raw_text.append(block["text"]) - formatted_text.append(text) + formatted_text.append(md_text) # Unknown block; can be a third-party app else: @@ -939,13 +994,16 @@ def map_item(post): blocks.append(block_type) - # Add note data + # Parse note data for note in post.get("notes", []): + if note["type"] == "like": - # Inserting at the start of the list to maintain chronological order. - authors_liked.insert(0, note["blog_name"]) - elif note["type"] in ("posted", "reblog"): - # If the original post is a text reblog, it will also show up in the notes. + if isinstance(like_count, str): + like_count = 0 + like_count += 1 + + if note["type"] in ("posted", "reblog"): + # If the post is a text reblog, it will also show up in the notes. # We can skip these since the data is already in the main post dict. if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: authors_reblogged.insert(0, note["blog_name"]) @@ -953,9 +1011,31 @@ def map_item(post): authors_replied.insert(0, note["blog_name"]) replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) + # The API sometimes gives back a 'trail' of reblogged content + # This includes reblogged content, but it's not entirely complete (e.g. no hashtags) + # so we'll only store the original blog name and its text content. + for i, reblog in enumerate(post.get("trail", [])): + + reblogged_text = [] + + if "broken_blog_name" in reblog: + reblog_author = reblog["broken_blog_name"] + else: + reblog_author = reblog["blog"]["name"] + + for reblog_block in reblog.get("content", []): + if reblog_block["type"] == "text": + reblogged_text.append(reblog_block["text"]) + + if not reblogged_text: + reblogged_text = "" + body_reblogged.append("\n".join(reblogged_text)) + + author_trail.append(reblog_author) + return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], - "id": post["id"], + "id": post["id"] if "id" in post else post["post"]["id"], "author": post["blog_name"], "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], @@ -967,18 +1047,21 @@ def map_item(post): "author_last_updated": post["blog"]["updated"], "post_url": post["post_url"], "post_slug": post["slug"], - "is_reblog": True if post.get("original_type") == "note" else "", + "is_reblog": True if post.get("parent_post_url") else "", + "reblog_key": post["reblog_key"], "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), - "body_asked": "\n".join(body_asked), - "author_asked": author_asked, + "body_reblogged": "\n\n".join(body_reblogged), + "author_trail": ",".join(author_trail), + "parent_post_url": post.get("parent_post_url", ""), + "authors_reblogged": ",".join(authors_reblogged), + "body_ask": "\n".join(body_ask), + "author_ask": author_ask, "content_order": ",".join(blocks), "tags": ",".join(post.get("tags", "")), "notes": post["note_count"], - "like_count": len(authors_liked), - "authors_liked": ",".join(authors_liked), - #"reblog_count": len(authors_reblogged), - "reply_count": 
len(authors_replied), + "like_count": like_count, + "reply_count": len(authors_replied) if authors_replied else "", "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "link_urls": ",".join(link_urls), @@ -994,6 +1077,74 @@ def map_item(post): "unknown_blocks": "\n".join(unknown_blocks) }) + @staticmethod + def format_tumblr_text(text_content): + """ + Format text content according to Tumblr's Neue Post Format definition. + + :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) + :returns dict + + """ + + text = text_content["text"] + + if text_content.get("formatting"): + + # Dict with index numbers as keys where inserts need to be made, + # and the replacement strings as values. Done this way so we know + # when multiple formatting operations need to be made at the same + # index position. + insert_indexes = set() + inserts = {} + + for fmt in text_content["formatting"]: + fmt_type = fmt["type"] + if fmt["type"] in ("link", "bold", "italic"): + s = fmt["start"] + e = fmt["end"] + + opening = True # To know if styles need to be appended or prepended + for n in [s, e]: + insert_indexes.add(n) + n = str(n) + if n not in inserts: + inserts[n] = "" + if fmt_type == "link" and opening: + inserts[n] = inserts[n] + "[" + elif fmt_type == "link" and not opening: + inserts[n] = "](" + fmt["url"] + ")" + inserts[n] + elif fmt_type == "italic": + inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" + elif fmt_type == "bold": + inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" + opening = False + if inserts: + extra_chars = 0 + for n, insert in inserts.items(): + n = int(n) + extra_chars + text = text[:n] + insert + text[n:] + extra_chars += len(insert) + + # Some more 'subtype' formatting + subtype = text_content.get("subtype") + if subtype: + if subtype == "unordered-list-item": + text = "- " + text + if subtype == "ordered-list-item": + text = str(ordered_list_count) + ". " + text + ordered_list_count += 1 + elif subtype == "heading1": + text = "#" + text + elif subtype == "heading2": + text = "##" + text + elif subtype == "quote": + text = ">" + text + elif subtype == "indented": + text = " " + text + + return text + def after_process(self): """ Override of the same function in processor.py From 7d9df2d1f585d7b55ccca4bf5b5d5dd30b0b9126 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 26 Jul 2024 17:57:16 +0200 Subject: [PATCH 109/204] Improve Tumblr querying --- datasources/tumblr/search_tumblr.py | 272 ++++++++++++++++------------ 1 file changed, 157 insertions(+), 115 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index f2ad00dc5..d4997989b 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -107,62 +107,49 @@ def get_options(cls, parent_dataset=None, user=None): "query": { "type": UserInput.OPTION_TEXT_LARGE, "help": "Tags and/or blogs", - "tooltip": "E.g. #research tools, #digitalmethods, @the4catblog, @the4catblog:12347714095" + "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, "help": "Add note and reblog data (warning: slow)", - "tooltip": "Add post note data for every post. This includes note metrics (likes, replies, reblogs), " + "tooltip": "Add post and reblog note data for every post. This includes note metrics (likes, replies, reblogs), " "replies, and reblogged text. 
" - "Blog-level search includes reblogged text by default." - "This also allows adding reblogs as new posts", + "Blog-level search includes reblogged text by default. " + "Enables adding reblogs as new posts", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs of collected posts", - "tooltip": "Also include posts that reblogged the posts captured in the initial query. " + "tooltip": "Add posts that reblogged posts from the initial query to the dataset. " "Limited to 1,000 reblogs per post.", - "default": False, - "requires": "get_notes" - }, - "reblog_crawl_depth": { - "type": UserInput.OPTION_TEXT, - "help": "Reblog crawl depth", - "tooltip": "How many levels of reblogs to follow; e.g. a value of 2 adds every reblog " - "of the initial post, but also reblogs of these reblogs.", - "default": "1", - "requires": "get_reblogs", - "requires": "get_notes" + "requires": "get_notes==true", + "default": False }, - "follow_reblogs": { - "type": UserInput.OPTION_TOGGLE, - "help": "Add posts reblogged by collected posts", - "tooltip": "Also include posts that were reblogged by the posts captured in the initial query. " - "This adds the entire reblog 'trail' from the initial post to the collected post. " - "Only affects blog-level search; tag search only gets original posts.", - "default": False, - "requires": "get_notes" - }, "reblog_type": { "type": UserInput.OPTION_CHOICE, - "help": "Reblogs to include", + "help": "Reblogs to add", "options": { - "all": "All (also 'empty' reblogs)", - "hashtag_or_text": "Only with added hashtags and/or added text", - "hashtag": "Only with added hashtags", - "text": "Only with added text" + "text": "Only with added text", + "tag_or_text": "Only with added text and/or added hashtags" }, "tooltip": "What type of reblogs to add to the dataset.", - "default": "hashtag_or_text", - "requires": "get_notes" + "requires": "get_reblogs==true", + "default": "text" }, + "follow_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add posts reblogged by collected posts", + "tooltip": "Also include all posts that were reblogged by the posts captured in the initial query. " + "This adds the entire reblog 'trail' from the initial post to the collected post. " + "Only affects blog-level search; tag-level search only gets original posts.", + "default": False + }, "reblog_outside_daterange": { "type": UserInput.OPTION_TOGGLE, "help": "Allow reblogs and reblogged posts outside of date range", - "default": False, - "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the date range limits inserted below.", - "requires": "get_notes" + "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the optional date range limit inserted below.", + "default": True } } @@ -235,7 +222,7 @@ def get_items(self, query): get_reblogs = parameters.get("get_reblogs", False) reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) follow_reblogs = parameters.get("follow_reblogs", False) - reblog_type = parameters.get("reblog_type", "hashtag_or_text") + reblog_type = parameters.get("reblog_type", False) reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) # Store all info here @@ -243,7 +230,7 @@ def get_items(self, query): # Blog names and post IDs of extra posts we need to fetch # (e.g. 
in the reblog trail or posts that reblog captured posts) - extra_posts = set() + extra_posts = [] # Get date parameters min_date = parameters.get("min_date", None) @@ -281,7 +268,6 @@ def get_items(self, query): post_id = None if ":" in query: blog_name, post_id = blog_name.split(":") - new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) # Get tagged post @@ -305,13 +291,25 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break + # Check for reblogged posts in the reblog trail + if follow_reblogs: + for result in results: + # The post rail is stored in the trail list + for trail_post in result.get("trail", []): + # Some posts or blogs have been deleted; skip these + if not "broken_blog_name" in trail_post: + if trail_post["id"] not in self.seen_ids: + extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post"]["id"]}) + # Get note data. - # Also potentially store reblogs that we want to add to the dataset + # Blog-level searches already have some note data, like reblogged text, + # but not everything (like replies), so we're going to retrieve these here as well. + # Also store IDs of reblogs/reblogged posts that we want to add. if get_notes: - # Dictionary with the `reblog_key` as key and notes as value. - # Notes are the same for all posts in a reblog chain, - # so we can use this to check whether we already have the data. + # Create a dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain. + # This means that we may not have to re-query the same data. retrieved_notes = {} for i, post in enumerate(results): @@ -321,52 +319,74 @@ def get_items(self, query): if self.api_limit_reached: break - self.dataset.update_status("Retrieving notes for post %i/%i" % (i, len_results)) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i+1, len(results))) # We may have already encountered this note-chain # with a different post. if post["reblog_key"] in retrieved_notes: notes = retrieved_notes[post["reblog_key"]] - # In the case of posts with just a few notes in blog-level search, + # In the case of posts with just a few notes, # we may have all the possible notes in the retrieved JSON. elif len(post["notes"]) == post["note_count"]: - notes = post["notes"] - - # Do some conversion that is also done in get_notes - for note in notes: - + # Add some metrics, like done in `get_notes`. + notes = { + "notes": post["notes"], + "reply_count": len([n for n in notes if n["type"] == "reply"]), + "reblog_count": len([n for n in notes if n["type"] == "reblog"]), + "like_count": len([n for n in notes if n["type"] == "like"]) + } else: - # We're getting notes in the "conversation" mode to - # prioritise replies and text reblogs. + # Get notes via the API # Only gets first 1,000 replies/text reblogs. - notes = self.get_notes(post["blog_name"], post["id"]) - time.sleep(.1) - - final_notes = {"notes": notes, - "like_count": notes["like_count"], - "reply_count": notes["reply_count"], - "reblog_count": notes["reblog_count"], - } - # Add to results - results[i] |= final_notes - retrieved_notes[post["reblog_key"]] = final_notes - - # Get the full data for text reblogs and add them as new posts + # We're using different querying modes since + # it'll speed up the process. The fastest is + # `conversation`, which prioritises text reblogs and + # replies, and also provides metrics on like and reblog counts; + # we'll use this as default. 
However, if the user + # has indicated they want to add reblogs with hashtags, + # we'll also have to use the `reblogs_with_tags` mode. + seen_notes = set() + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_notes=1000) + for note in notes["notes"]: + if note["type"] == "reblog": + seen_notes.add(note["post_id"]) + + # Get tag-only reblogs; these aren't returned in `conversation` mode. + if reblog_type == "tag_or_text": + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) + for tag_note in tag_notes: + if tag_note["post_id"] not in seen_notes: + notes["notes"].append(tag_note) + + # Add to posts + results[i] = {**results[i], **notes} + retrieved_notes[post["reblog_key"]] = notes + + # Get the full data for certain reblogs and add them as new posts if get_reblogs: - for note in final_notes["notes"]: + for note in notes["notes"]: - if reblog_type == "hashtag_or_text": + # Skip replies and likes + if note["type"] != "reblog": + continue - elif reblog_type == "hashtag_or_text": + elif reblog_type == "tag_or_text": + # Skip reblogs without tags or text + if not note.get("tags") and not note.get("added_text"): + continue - elif reblog_type == "text": + elif reblog_type == "text": + # Skip reblogs without added text + if not note.get("added_text"): + continue - elif reblog_type == "all": - pass + # Skip posts that we already collected + if note["post_id"] in self.seen_ids: + continue # Potentially skip new posts outside of the date range if reblog_outside_daterange and (max_date or min_date): @@ -376,23 +396,32 @@ def get_items(self, query): elif not min_date <= note["timestamp"] <= max_date: continue - extra_posts.add({"blog": note["blog_name"], "post_id": note["post_id"]}) - - # Check for reblogged posts in the reblog trail - if follow_reblogs: - for result in results: - if result["trail"] + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) # Add reblogged posts and reblogs to dataset - for extra_post in extra_posts: - print("add") + for i, extra_post in enumerate(extra_posts): + + self.dataset.update_status("Adding %s/%s reblogged posts to the dataset" % (i, len(extra_posts))) + + if extra_post["id"] not in self.seen_ids: + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) + + if new_post: + new_post = new_post[0] + + # Add note data; these should already be retrieved above + if get_notes: + new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} + + results.append(new_post) + self.seen_ids.add(extra_post["id"]) self.job.finish() return results def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ - Get Tumblr posts posts with a certain tag + Get Tumblr posts posts with a certain tag. :param tag, str: the tag you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. :param max_date: a unix timestamp, indicates posts should be max_date this date. 
@@ -483,7 +512,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): # self.api_limit_reached = True # break - # Make sure the Tumblr API doesn't magically stop at an earlier date + # Make sure the Tumblr API doesn't magically stop even if earlier posts are available if not posts: date_retries += 1 @@ -548,7 +577,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): time_difs.append(time_dif) - # To start a new query + # Stop if we found nothing for this query if not posts: break @@ -633,7 +662,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, notes_info=True, filter="raw", npf=True) + posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True) posts = posts["posts"] except ConnectionRefusedError: @@ -709,11 +738,14 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): return all_posts - def get_notes(self, blog_id, post_id): + def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): """ Gets data on the notes of a specific post. - :param blog_id, str: The ID of the blog. - :param post_id, str: The ID of the post. + :param blog_id, str: The ID of the blog. + :param post_id, str: The ID of the post. + :param mode, str: The type of notes that get priority. + `conversation` prioritises text reblogs and replies. + :param mode, max_notes: Maximum amount of notes to return. :returns: a list with dictionaries of notes. """ @@ -733,6 +765,7 @@ def get_notes(self, blog_id, post_id): notes_retries = 0 first_batch = True + note_metrics = {} count += 1 @@ -746,14 +779,14 @@ def get_notes(self, blog_id, post_id): self.failed_posts.append(post_id) break - # Requests a post's notes + # Request a post's notes try: - # Imprtant: we're getting notes in 'conversation' mode to + # Important: we're getting notes in 'conversation' mode to # prioritise replies and reblogs that add text. # We're not interested in the the names of authors that liked the post # or who reblogged without adding content. - notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode="conversation") + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode=mode) except ConnectionRefusedError: self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 @@ -770,19 +803,31 @@ def get_notes(self, blog_id, post_id): notes_retries = 0 # Add some metrics for the first response - if first_batch: + if first_batch and mode == "conversation": note_metrics = { - "reply_count": notes["total_replies"], + "note_count": notes["total_notes"], "reblog_count": notes["total_reblogs"], - "like_count": notes["total_likes"] + "like_count": notes["total_likes"], + "reply_count": 0 } first_batch = False + # Add notes for note in notes["notes"]: + if mode == "converstaion" and note["type"] == "reply": + note_metrics["reply_count"] += 1 + post_notes.append(note) + # `conversation` mode groups likes and reblogs without commentary + # in the `rollup_notes` list. We're adding reblogs to the post notes. 
+ if mode == "conversation": + if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: + for note in notes["rollup_notes"][0]["notes"]: + if note["type"] == "reblog": + post_notes.append(note) + if notes.get("_links"): - print("more notes for " + str(blog_id) + " " + str(post_id)) max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] time.sleep(.2) @@ -796,7 +841,8 @@ def get_notes(self, blog_id, post_id): time.sleep(1) continue - post_notes = {"notes": post_notes} | note_metrics + # Merge notes and note metrics + post_notes = {"notes": post_notes, **note_metrics} return post_notes @@ -827,7 +873,10 @@ def connect_to_tumblr(self): self.client = pytumblr.TumblrRestClient(*config_keys) - client_info = self.client.info() + try: + client_info = self.client.info() + except Exception as e: + raise ConnectionRefusedError("Couldn't connect to Tumblr API, (%s)" % e) # Check if there's any errors if client_info.get("meta"): @@ -906,7 +955,7 @@ def map_item(post): authors_reblogged = [] reblog_trail = [] body_reblogged = [] - author_trail = [] + reblog_trail = [] body_ask = [] author_ask = "" authors_replied = [] @@ -961,6 +1010,7 @@ def map_item(post): video_thumb_urls.append(block["poster"][0]["url"]) else: video_thumb_urls.append("") + # Embedded link elif block_type == "link": link_urls.append(block["url"]) @@ -994,22 +1044,11 @@ def map_item(post): blocks.append(block_type) - # Parse note data + # Parse some note for note in post.get("notes", []): - - if note["type"] == "like": - if isinstance(like_count, str): - like_count = 0 - like_count += 1 - - if note["type"] in ("posted", "reblog"): - # If the post is a text reblog, it will also show up in the notes. - # We can skip these since the data is already in the main post dict. - if note["blog_name"] != post["blog_name"] and note["timestamp"] != post["timestamp"]: - authors_reblogged.insert(0, note["blog_name"]) - elif note["type"] == "reply": + if note["type"] == "reply": authors_replied.insert(0, note["blog_name"]) - replies.insert(0, note["blog_name"] + ": " + note["reply_text"]) + replies.insert(0, note["reply_text"]) # The API sometimes gives back a 'trail' of reblogged content # This includes reblogged content, but it's not entirely complete (e.g. 
no hashtags) @@ -1031,7 +1070,7 @@ def map_item(post): reblogged_text = "" body_reblogged.append("\n".join(reblogged_text)) - author_trail.append(reblog_author) + reblog_trail.append(reblog_author) return MappedItem({ "type": post["original_type"] if "original_type" in post else post["type"], @@ -1052,16 +1091,16 @@ def map_item(post): "body": "\n".join(raw_text), "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n\n".join(body_reblogged), - "author_trail": ",".join(author_trail), + "reblog_trail": ",".join(reblog_trail), "parent_post_url": post.get("parent_post_url", ""), - "authors_reblogged": ",".join(authors_reblogged), "body_ask": "\n".join(body_ask), "author_ask": author_ask, "content_order": ",".join(blocks), "tags": ",".join(post.get("tags", "")), - "notes": post["note_count"], - "like_count": like_count, - "reply_count": len(authors_replied) if authors_replied else "", + "note_count": post["note_count"], + "reblog_count": post.get("reblog_count", ""), + "like_count": post.get("like_count", ""), + "reply_count": post.get("reply_count", ""), "authors_replied": ",".join(authors_replied), "replies": "\n\n".join(replies), "link_urls": ",".join(link_urls), @@ -1081,6 +1120,7 @@ def map_item(post): def format_tumblr_text(text_content): """ Format text content according to Tumblr's Neue Post Format definition. + Returns text as mardkown. :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) :returns dict @@ -1119,6 +1159,8 @@ def format_tumblr_text(text_content): elif fmt_type == "bold": inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" opening = False + + # Change text if inserts: extra_chars = 0 for n, insert in inserts.items(): From 6fe891c35ba1a7ee714b0bfb1f02a2a6b0c18659 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 30 Jul 2024 11:02:36 +0200 Subject: [PATCH 110/204] Change options for Tumblr --- datasources/tumblr/search_tumblr.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index d4997989b..c28465de0 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -120,8 +120,8 @@ def get_options(cls, parent_dataset=None, user=None): }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, - "help": "Add reblogs of collected posts", - "tooltip": "Add posts that reblogged posts from the initial query to the dataset. " + "help": "Add reblogs", + "tooltip": "Add posts that reblogged the initial results to the dataset. " "Limited to 1,000 reblogs per post.", "requires": "get_notes==true", "default": False @@ -131,7 +131,7 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Reblogs to add", "options": { "text": "Only with added text", - "tag_or_text": "Only with added text and/or added hashtags" + "text_or_tag": "Only with added text and/or added hashtags" }, "tooltip": "What type of reblogs to add to the dataset.", "requires": "get_reblogs==true", @@ -140,7 +140,7 @@ def get_options(cls, parent_dataset=None, user=None): "follow_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add posts reblogged by collected posts", - "tooltip": "Also include all posts that were reblogged by the posts captured in the initial query. " + "tooltip": "Add all posts that were reblogged by the initial results to the dataset. " "This adds the entire reblog 'trail' from the initial post to the collected post. 
" "Only affects blog-level search; tag-level search only gets original posts.", "default": False @@ -298,8 +298,8 @@ def get_items(self, query): for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these if not "broken_blog_name" in trail_post: - if trail_post["id"] not in self.seen_ids: - extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post"]["id"]}) + if trail_post["post_id"] not in self.seen_ids: + extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post_id"]}) # Get note data. # Blog-level searches already have some note data, like reblogged text, @@ -355,7 +355,7 @@ def get_items(self, query): seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. - if reblog_type == "tag_or_text": + if reblog_type == "text_or_tag": tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) for tag_note in tag_notes: if tag_note["post_id"] not in seen_notes: @@ -374,7 +374,7 @@ def get_items(self, query): if note["type"] != "reblog": continue - elif reblog_type == "tag_or_text": + elif reblog_type == "text_or_tag": # Skip reblogs without tags or text if not note.get("tags") and not note.get("added_text"): continue @@ -767,8 +767,6 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): first_batch = True note_metrics = {} - count += 1 - if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") @@ -814,9 +812,10 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # Add notes for note in notes["notes"]: - if mode == "converstaion" and note["type"] == "reply": + if mode == "conversation" and note["type"] == "reply": note_metrics["reply_count"] += 1 + count += 1 post_notes.append(note) # `conversation` mode groups likes and reblogs without commentary @@ -825,10 +824,15 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: for note in notes["rollup_notes"][0]["notes"]: if note["type"] == "reblog": + count += 1 post_notes.append(note) + if count >= max_notes: + break + if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + self.dataset.update_status("Added %s notes for post %s:%s" % (count, blog_id, post_id)) time.sleep(.2) # If there's no `_links` key, that's all. 
From 622f7791a62beee46fe900a3e91f208ac41f8ce7 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 30 Jul 2024 16:35:32 +0200 Subject: [PATCH 111/204] Revamp Tumblr search and allow reblogs in Explorer template --- datasources/tumblr/search_tumblr.py | 215 ++++++++++-------- webtool/lib/template_filters.py | 3 +- webtool/static/css/explorer/tumblr.css | 88 +++++-- .../explorer/datasource-templates/tumblr.html | 90 ++++++-- 4 files changed, 252 insertions(+), 144 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index c28465de0..dccd4d4da 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -1,7 +1,7 @@ """ Search Tumblr via its API -Can fetch posts from specific blogs or with specific hashtags +Can fetch posts from specific blogs or with specific tags For Tumblr API documentation, see https://www.tumblr.com/docs/en/api/v2 For Neue Post Format documentation, see https://github.com/tumblr/docs/blob/master/npf-spec.md @@ -35,7 +35,7 @@ class SearchTumblr(Search): type = "tumblr-search" # job ID category = "Search" # category title = "Search Tumblr" # title displayed in UI - description = "Retrieve Tumblr posts by hashtag or blog." # description displayed in UI + description = "Retrieve Tumblr posts by tags or blogs." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -47,6 +47,7 @@ class SearchTumblr(Search): max_retries = 3 # For API and connection retries. max_date_retries = 96 + 150 # For checking dates. 96 time retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). max_posts = 1000000 + max_reblogs = 1000 max_posts_reached = False api_limit_reached = False @@ -111,18 +112,19 @@ def get_options(cls, parent_dataset=None, user=None): }, "get_notes": { "type": UserInput.OPTION_TOGGLE, - "help": "Add note and reblog data (warning: slow)", - "tooltip": "Add post and reblog note data for every post. This includes note metrics (likes, replies, reblogs), " - "replies, and reblogged text. " - "Blog-level search includes reblogged text by default. " - "Enables adding reblogs as new posts", + "help": "Add note data (warning: slow)", + "tooltip": "Add note data for every post. This includes note metrics, " + "replies, reblogged text, and reblogged images. " + "Blog- and id-level search includes reblogged text by default. " + "Enables adding reblogs as new posts " + "Limited to the first 1,000 reblogs per post.", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs", - "tooltip": "Add posts that reblogged the initial results to the dataset. " - "Limited to 1,000 reblogs per post.", + "tooltip": "Add reblogs to the dataset. " + "", "requires": "get_notes==true", "default": False }, @@ -131,25 +133,17 @@ def get_options(cls, parent_dataset=None, user=None): "help": "Reblogs to add", "options": { "text": "Only with added text", - "text_or_tag": "Only with added text and/or added hashtags" + "text_or_tag": "Only with added text and/or added tags (slow)" }, "tooltip": "What type of reblogs to add to the dataset.", "requires": "get_reblogs==true", "default": "text" }, - "follow_reblogs": { - "type": UserInput.OPTION_TOGGLE, - "help": "Add posts reblogged by collected posts", - "tooltip": "Add all posts that were reblogged by the initial results to the dataset. 
" - "This adds the entire reblog 'trail' from the initial post to the collected post. " - "Only affects blog-level search; tag-level search only gets original posts.", - "default": False - }, "reblog_outside_daterange": { "type": UserInput.OPTION_TOGGLE, - "help": "Allow reblogs and reblogged posts outside of date range", - "tooltip": "Whether to keep reblogs or reblogged posts that fall outside the optional date range limit inserted below.", - "default": True + "help": "Retain reblogs outside of date range", + "requires": "get_reblogs==true", + "default": False } } @@ -220,8 +214,6 @@ def get_items(self, query): queries = re.split(",|\n", parameters.get("query", "")) get_notes = parameters.get("get_notes", False) get_reblogs = parameters.get("get_reblogs", False) - reblog_crawl_depth = parameters.get("reblog_crawl_depth", 0) - follow_reblogs = parameters.get("follow_reblogs", False) reblog_type = parameters.get("reblog_type", False) reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) @@ -235,13 +227,12 @@ def get_items(self, query): # Get date parameters min_date = parameters.get("min_date", None) max_date = parameters.get("max_date", None) + min_date = int(min_date) if min_date else 0 + max_date = int(max_date) if max_date else int(time.time()) - if min_date: - min_date = int(min_date) - if max_date: - max_date = int(max_date) - else: - max_date = int(time.time()) + if not queries: + self.dataset.finish_with_error("No queries given") + return # Connect to Tumblr API try: @@ -256,8 +247,9 @@ def get_items(self, query): self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return - # for each tag or blog, get post - for query in queries: + # For each tag or blog, get posts + # with a limit of ten individual tasks. + for query in queries[:10]: query = query.strip() @@ -291,15 +283,17 @@ def get_items(self, query): self.dataset.update_status("API limit reached") break - # Check for reblogged posts in the reblog trail - if follow_reblogs: + # Check for reblogged posts in the reblog trail; + # we're addingt these if we're adding reblogs. + if get_reblogs: for result in results: # The post rail is stored in the trail list for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these if not "broken_blog_name" in trail_post: - if trail_post["post_id"] not in self.seen_ids: - extra_posts.add({"blog": trail_post["blog"], "id": trail_post["post_id"]}) + if trail_post["post"]["id"] not in self.seen_ids: + extra_posts.append({"blog": trail_post["blog"]["name"], + "id": trail_post["post"]["id"]}) # Get note data. # Blog-level searches already have some note data, like reblogged text, @@ -332,32 +326,35 @@ def get_items(self, query): # Add some metrics, like done in `get_notes`. notes = { "notes": post["notes"], - "reply_count": len([n for n in notes if n["type"] == "reply"]), - "reblog_count": len([n for n in notes if n["type"] == "reblog"]), - "like_count": len([n for n in notes if n["type"] == "like"]) + "reply_count": len([n for n in post["notes"] if n["type"] == "reply"]), + "reblog_count": len([n for n in post["notes"] if n["type"] == "reblog"]), + "like_count": len([n for n in post["notes"] if n["type"] == "like"]) } else: # Get notes via the API - # Only gets first 1,000 replies/text reblogs. + # Only gets first 1,000 replies or text/tag reblogs. # We're using different querying modes since # it'll speed up the process. 
The fastest is # `conversation`, which prioritises text reblogs and # replies, and also provides metrics on like and reblog counts; - # we'll use this as default. However, if the user - # has indicated they want to add reblogs with hashtags, - # we'll also have to use the `reblogs_with_tags` mode. + # we'll use this as default. If the user + # has indicated they also want to add reblogs with tags, + # we'll also use the `reblogs_with_tags` mode. seen_notes = set() - notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_notes=1000) + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) + reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog": - seen_notes.add(note["post_id"]) + if note["type"] == "reblog" or note["type"] == "reply": + if note["type"] == "reblog": # Replies don't have IDs + reblog_count += 1 + seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. - if reblog_type == "text_or_tag": - tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_notes=1000) - for tag_note in tag_notes: + if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_reblogs=self.max_reblogs - reblog_count) + for tag_note in tag_notes["notes"]: if tag_note["post_id"] not in seen_notes: notes["notes"].append(tag_note) @@ -374,34 +371,19 @@ def get_items(self, query): if note["type"] != "reblog": continue - elif reblog_type == "text_or_tag": - # Skip reblogs without tags or text - if not note.get("tags") and not note.get("added_text"): - continue - - elif reblog_type == "text": - # Skip reblogs without added text - if not note.get("added_text"): - continue - - # Skip posts that we already collected - if note["post_id"] in self.seen_ids: - continue - - # Potentially skip new posts outside of the date range - if reblog_outside_daterange and (max_date or min_date): - if not min_date: - if note["timestamp"] >= max_date: - continue - elif not min_date <= note["timestamp"] <= max_date: - continue - - extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) + if note["post_id"] not in self.seen_ids: + # Potentially skip extra posts outside of the date range + if not reblog_outside_daterange: + if note.get("timestamp"): + if not min_date >= note["timestamp"] >= max_date: + continue + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) + # Add reblogged posts and reblogs to dataset for i, extra_post in enumerate(extra_posts): - self.dataset.update_status("Adding %s/%s reblogged posts to the dataset" % (i, len(extra_posts))) + self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) @@ -409,7 +391,15 @@ def get_items(self, query): if new_post: new_post = new_post[0] - # Add note data; these should already be retrieved above + # Potentially skip new posts outside of the date range + # We (also) do this after the API call because a timestamp is + # not always present in the notes data. 
+ if not reblog_outside_daterange: + + if not min_date >= new_post["timestamp"] >= max_date: + continue + + # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -517,7 +507,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): date_retries += 1 - # We're first gonna check carefully if there's small timegaps by + # We're first gonna check carefully if there's small time gaps by # decreasing by six hours. # If that didn't result in any new posts, also dedicate 12 date_retries # with reductions of six months, just to be sure there's no data from @@ -530,7 +520,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. - if min_date: + if min_date != 0: if max_date <= min_date: break @@ -583,7 +573,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): # Manually check if we have a lower date than the lowest allowed date already (min date). # This functonality is not natively supported by Tumblr. - if min_date: + if min_date != 0: if max_date < min_date: # Get rid of all the posts that are earlier than the max_date timestamp @@ -692,7 +682,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): # time reductions. Make sure it's no error string, though. new_posts = [] for post in posts: - # Sometimes the API repsonds just with "meta", "response", or "errors". + # Sometimes the API reponds just with "meta", "response", or "errors". if isinstance(post, str): self.dataset.update_status("Couldn't add post:", post) retries += 1 @@ -705,7 +695,10 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): posts = new_posts - if not post_id: + if not new_posts: + break + + if new_posts and not post_id: # Append posts to main list # Get the lowest date @@ -713,7 +706,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): # Manually check if we have a lower date than the min date (`min_date`) already. # This functonality is not natively supported by Tumblr. - if min_date: + if min_date != 0: if max_date < min_date: # Get rid of all the posts that are earlier than the max_date timestamp @@ -738,19 +731,18 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): return all_posts - def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): + def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): """ Gets data on the notes of a specific post. :param blog_id, str: The ID of the blog. :param post_id, str: The ID of the post. :param mode, str: The type of notes that get priority. `conversation` prioritises text reblogs and replies. - :param mode, max_notes: Maximum amount of notes to return. + :param mode, max_reblogs: Maximum amount of notes to return. :returns: a list with dictionaries of notes. 
""" - - note_metrics = {} + post_notes = [] max_date = None @@ -761,18 +753,26 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # so we'll cap this at 100 # Stop trying to fetch the notes after this many retries - max_notes_retries = 10 + max_reblogs_retries = 10 notes_retries = 0 first_batch = True note_metrics = {} + stop_collecting = False + + # For status updates + if mode == "conversation": + note_type = "reblogs with text" + elif mode == "reblogs_with_tags": + note_type = "reblogs with tags" + if self.interrupted: raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") while True: - if notes_retries >= max_notes_retries: + if notes_retries >= max_reblogs_retries: self.dataset.update_status("Too many connection errors; unable to collect notes for post %s" % post_id) self.failed_posts.append(post_id) break @@ -801,6 +801,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): notes_retries = 0 # Add some metrics for the first response + # These metrics are only returned in conversation mode. if first_batch and mode == "conversation": note_metrics = { "note_count": notes["total_notes"], @@ -812,27 +813,33 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_notes=1000): # Add notes for note in notes["notes"]: + + # Only count reblogs with added content (text or hashtags) + # towards the total count; replies are never too substantial, + # so we always collect them all. if mode == "conversation" and note["type"] == "reply": note_metrics["reply_count"] += 1 + elif mode == "conversation": + count += 1 + elif mode == "reblogs_with_tags": + # Skip notes without added tags + if not note.get("tags"): + continue + count += 1 - count += 1 post_notes.append(note) + + if count >= max_reblogs: + post_notes = post_notes[:count + note_metrics.get("reply_count", 0)] + stop_collecting = True - # `conversation` mode groups likes and reblogs without commentary - # in the `rollup_notes` list. We're adding reblogs to the post notes. - if mode == "conversation": - if "rollup_notes" in notes and "notes" in notes["rollup_notes"][0]: - for note in notes["rollup_notes"][0]["notes"]: - if note["type"] == "reblog": - count += 1 - post_notes.append(note) - - if count >= max_notes: + if stop_collecting: break if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - self.dataset.update_status("Added %s notes for post %s:%s" % (count, blog_id, post_id)) + + self.dataset.update_status("Collected %s %s for @%s:%s" % (count, note_type, blog_id, post_id)) time.sleep(.2) # If there's no `_links` key, that's all. @@ -918,7 +925,7 @@ def validate_query(query, request, user): raise QueryParametersException("Search query cannot be empty.") # So it shows nicely in the frontend. - items = ", ".join([item.lstrip().rstrip() for item in items if item]) + items = ", ".join([item.strip() for item in items if item]) # the dates need to make sense as a range to search within query["min_date"], query["max_date"] = query.get("daterange") @@ -945,6 +952,7 @@ def map_item(post): media_types = ["photo", "video", "audio"] image_urls = [] + image_urls_reblogged = [] video_urls = [] video_thumb_urls = [] audio_urls = [] @@ -1055,8 +1063,8 @@ def map_item(post): replies.insert(0, note["reply_text"]) # The API sometimes gives back a 'trail' of reblogged content - # This includes reblogged content, but it's not entirely complete (e.g. 
no hashtags) - # so we'll only store the original blog name and its text content. + # This includes reblogged content, but it's not entirely complete (e.g. no tags) + # so we'll only store the original blog name and its text + image content. for i, reblog in enumerate(post.get("trail", [])): reblogged_text = [] @@ -1069,6 +1077,8 @@ def map_item(post): for reblog_block in reblog.get("content", []): if reblog_block["type"] == "text": reblogged_text.append(reblog_block["text"]) + if reblog_block["type"] == "image": + image_urls_reblogged.append(reblog_block["media"][0]["url"]) if not reblogged_text: reblogged_text = "" @@ -1082,7 +1092,8 @@ def map_item(post): "author": post["blog_name"], "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], - "timestamp": post["timestamp"], + "timestamp": datetime.fromtimestamp(post["timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), + "unix_timestamp": post["timestamp"], "author_subject": post["blog"]["title"], "author_description": strip_tags(post["blog"]["description"]), "author_url": post["blog"]["url"], @@ -1096,6 +1107,7 @@ def map_item(post): "body_markdown": "\n".join(formatted_text), "body_reblogged": "\n\n".join(body_reblogged), "reblog_trail": ",".join(reblog_trail), + "parent_post_author": post.get("reblogged_from_name", ""), "parent_post_url": post.get("parent_post_url", ""), "body_ask": "\n".join(body_ask), "author_ask": author_ask, @@ -1111,6 +1123,7 @@ def map_item(post): "link_titles": "\n".join(link_titles), "link_descriptions": "\n".join(link_descriptions), "image_urls": ",".join(image_urls), + "image_urls_reblogged": ",".join(image_urls_reblogged), "video_urls": ",".join(video_urls), "video_thumb_urls": ",".join(video_thumb_urls), "audio_urls": ",".join(audio_urls), diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py index f7bfc23a2..f92bdfbd7 100644 --- a/webtool/lib/template_filters.py +++ b/webtool/lib/template_filters.py @@ -202,7 +202,8 @@ def _jinja2_filter_social_mediafy(body, datasource=""): }, "tumblr": { "mention": "https://tumblr.com/", - "markdown": True # Hashtags aren't linked in the post body + "markdown": True + # Hashtags aren't linked in the post body }, "linkedin": { "hashtag": "https://linkedin.com/feed/hashtag/?keywords=", diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index baeab6eac..9ef57cbcb 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -18,13 +18,54 @@ max-width: 540px; } +/* Author info */ +.author-row { + display: flex; + align-items: center; + padding: 19px; + font-size: 13px; + text-decoration: none; + color: #5e5e5e; + overflow: hidden; +} + .author { font-size: 13px; font-weight: bold; } +.author.pseudonymous { + width: 32px; + height: 32px; + border-radius: 3px; + color: white; + background-color: #2f4b66; + text-align: center; + vertical-align: middle; +} + +.author-row .author, .author-row a { + color: black; +} + +.author-row .author-avatar { + display: inline-block; +} + +.author-row .author { + display: inline-block; +} + +.author.pseudonymous i { + line-height: 32px; + color: white; +} + .author-avatar { width: 32px; +} + +.author-avatar:not(.reblog) { margin-right: 10px; } @@ -33,26 +74,27 @@ width: 100%; } -/* Main author info */ -header { - display: flex; - align-items: center; - padding: 19px; - text-decoration: none; - color: black; - overflow: hidden; +.author-row.reblog { + border-bottom: 1px solid 
rgba(0,0,0,0.13); } -header a { - color: black; +.reblog-notice { + padding-left: 3px; + padding-right: 3px; } -header .author-avatar { - display: inline-block; +.reblog-icon { + height: 32px; } -header .author { - display: inline-block; +.reblog-icon i { + background-color: #00cf35; + border-radius: 100%; + color: white; + font-size: 8px; + padding: 4px; + margin-top: 20px; + margin-left: -7px; } /* Media */ @@ -99,16 +141,16 @@ header .author { padding: 0px 19px 0px 19px; } -.post-content .body, .body-reblogged { - white-space: pre-wrap; +.post-content.reblog { + padding-bottom: 19px; line-height: 1.5em; + border-bottom: 1px solid rgba(0,0,0,0.13); } .post-content .body { - padding: 3px 0px 3px 0px; + padding: 10px 0px 10px 0px; } - .post-content h1 { font-size: 20px; font-weight: bold; @@ -183,9 +225,13 @@ a.embedded-link:hover { } .tags { - padding-top: 5px; + display: flex; + align-items: center; + padding: 19px 0px 19px 0px; + font-size: 15px; + text-decoration: none; list-style-type: none; - color: #5e5e5e; + color: #5e5e5e; word-break: break-all; } @@ -201,7 +247,7 @@ a.embedded-link:hover { /* Post footer */ footer { - margin: 19px; + margin: 0px 19px 19px 19px; padding-top: 19px; border-top: 1px solid rgba(0,0,0,0.13); } diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 3866f3c70..972766273 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -1,37 +1,85 @@ +{% set reblog = True if post.parent_post_author else False %}
    +
    {% if not pseudonymised %} - + {% if post["author_avatar_url"] %} -
    + {% endif %} - {{ post.get("author") }} + {% if reblog %} + + {% endif %} + {{ post.get("author") }} + {% if reblog %} reblogged {{ post.parent_post_author }}{% endif %} {% else %} - + + {% if post.parent_post_author %} reblogged{% endif %} {% endif %} +
    - -{% if post.author_trail %} +{% if reblog %} -{% for reblog_author in post.author_trail.split(",") %} -
    - {{ reblog_author }} -

    - {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} -

    -
    -{% endfor %} + {% for reblog_author in post.reblog_trail.split(",") %} +
    + {% if not pseudonymised %} +
    + + + +
    + {{ reblog_author }} + {% else %} + + {% endif %} +
    +
    + {% if post.get("image_urls_reblogged") %} + {% for image_url in post.image_urls_reblogged.split(",") %} +
    + +
    + {% endfor %} + {% endif %} +
    + {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} +
    +
    + {% endfor %} + + {% if post.body %} +
    + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
    + + + +
    + {% endif %} + + {{ post.get("author") }} + + {% else %} + + + {% endif %} +
    + {% endif %} {% endif %} @@ -44,7 +92,7 @@ {% for block in content_order %} {% if block == "text" %} -

    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}

    +
    {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}
    {% set block_counts.text = block_counts.text + 1 %} {% elif block == "image" %} @@ -128,24 +176,24 @@
    -
    {{ post.timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    +
    {{ post.unix_timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
    - {% if post.notes %} + {% if post.note_count %}
    - {{ post.get("notes") | commafy }} note{% if post.get("notes", 0) > 1 %}s{% endif %} + {{ post.get("note_count") | commafy }} note{% if post.get("note_count", 0) > 1 %}s{% endif %} {% if post.get("reblog_count") %} - {{ post.reblog_count }} + {{ post.reblog_count | commafy }} {% endif %} {% if post.get("like_count") %} - + {% endif %} {% if post.get("reply_count") %} - {{ post.get("reply_count") }} + {{ post.get("reply_count") | commafy }} {% endif %}
    {% if post.get("authors_replied") %} From c8f204e1ef65b8bc995a03f32126f93df0b8c46a Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 13:02:13 +0200 Subject: [PATCH 112/204] Improve and fix revamped Tumblr search --- datasources/tumblr/search_tumblr.py | 112 ++++++++++-------- webtool/static/css/explorer/tumblr.css | 1 + .../explorer/datasource-templates/tumblr.html | 3 + 3 files changed, 69 insertions(+), 47 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index dccd4d4da..efd932dc5 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -12,6 +12,7 @@ import pytumblr import requests import re +import json from requests.exceptions import ConnectionError from datetime import datetime from ural import urls_from_text @@ -96,18 +97,18 @@ def get_options(cls, parent_dataset=None, user=None): "type": UserInput.OPTION_INFO, "help": "Retrieve any kind of Tumblr posts with specific tags or from specific blogs. Gets 100.000 posts " "at max. You may insert up to ten tags or blogs.\n\n" - "Blog-level search also returns reblogs. *Tag-level search only returns original posts*. " - "Reblogs of tagged posts can be retrieved via the options below.\n\n" + "*Tag-level search only returns original posts*. " + "Reblogs of tagged posts can be retrieved via the options below. Blog-level search also returns reblogs.\n\n" "Tag search only get posts with the exact tag you insert. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`.\n\n" + "`gogh` will not get posts tagged with `van gogh`.\n\n" "A `#` before a tag is optional. Blog names must start with `@`.\n\n" - "Individual posts can be retrieved through the format `@blogname:post_id`.\n\n" + "Individual posts can be captured by inserting their URL or via the format `@blogname:post_id`.\n\n" "Keyword search is not allowed by the [Tumblr API](https://api.tumblr.com).\n\n" "If this 4CAT reached its Tumblr API rate limit, try again 24 hours later." }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags and/or blogs", + "help": "Tags, blogs, or post URLs. Seperate with comma or newline.", "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" }, "get_notes": { @@ -188,8 +189,9 @@ def get_options(cls, parent_dataset=None, user=None): } options["date-intro"] = { "type": UserInput.OPTION_INFO, - "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used " - "tags, it may return zero posts, even though older posts exist. To mitigate this, 4CAT decreases " + "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries.\n\n" + "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " "the date parameter (before) with six hours and sends the query again. This often " "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. 
" @@ -253,13 +255,42 @@ def get_items(self, query): query = query.strip() + post_id = None + + # Format @blogname:id if query.startswith("@"): - blog_name = query[1:] # Get a possible post ID - post_id = None + blog_name = query[1:] if ":" in query: blog_name, post_id = blog_name.split(":") + + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Post URL + elif "tumblr.com/" in query: + + try: + # Format https://{blogname}.tumblr.com/post/{post_id} + if "/post/" in query: + blog_name = query.split(".tumblr.com")[0].replace("https://", "").replace("www.", "").strip() + post_id = query.split("/")[-1].strip() + # May also be a slug string.. + if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + # Format https://tumblr.com/{blogname}/{post_id} + else: + blog_and_id = query.split("tumblr.com/")[-1] + blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL + blog_name, post_id = blog_and_id.split("/") + if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + except IndexError: + self.dataset.update_status("Invalid post URL: %s" % query) + continue + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) # Get tagged post @@ -284,7 +315,7 @@ def get_items(self, query): break # Check for reblogged posts in the reblog trail; - # we're addingt these if we're adding reblogs. + # we're storing their post IDs and blog names for later, if we're adding reblogs. if get_reblogs: for result in results: # The post rail is stored in the trail list @@ -322,7 +353,7 @@ def get_items(self, query): # In the case of posts with just a few notes, # we may have all the possible notes in the retrieved JSON. - elif len(post["notes"]) == post["note_count"]: + elif "notes" in post and (len(post["notes"]) == post["note_count"]): # Add some metrics, like done in `get_notes`. notes = { "notes": post["notes"], @@ -346,10 +377,9 @@ def get_items(self, query): notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog" or note["type"] == "reply": - if note["type"] == "reblog": # Replies don't have IDs - reblog_count += 1 - seen_notes.add(note["post_id"]) + if note["type"] == "reblog": # Replies don't have IDs + reblog_count += 1 + seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: @@ -362,7 +392,7 @@ def get_items(self, query): results[i] = {**results[i], **notes} retrieved_notes[post["reblog_key"]] = notes - # Get the full data for certain reblogs and add them as new posts + # Identify which notes/reblogs we can collect as new posts if get_reblogs: for note in notes["notes"]: @@ -378,6 +408,7 @@ def get_items(self, query): if note.get("timestamp"): if not min_date >= note["timestamp"] >= max_date: continue + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) # Add reblogged posts and reblogs to dataset @@ -386,19 +417,17 @@ def get_items(self, query): self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: - new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) + + # Potentially skip new posts outside of the date range + # not always present in the notes data. 
+ if not reblog_outside_daterange and (max_date and min_date): + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, min_date=min_date) + else: + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) if new_post: new_post = new_post[0] - - # Potentially skip new posts outside of the date range - # We (also) do this after the API call because a timestamp is - # not always present in the notes data. - if not reblog_outside_daterange: - if not min_date >= new_post["timestamp"] >= max_date: - continue - # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -693,33 +722,21 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): self.seen_ids.add(post["id"]) new_posts.append(post) - posts = new_posts - + # Possibly only keep posts within the date range. + if max_date and min_date: + new_posts = [p for p in new_posts if min_date <= p["timestamp"] <= max_date] + if not new_posts: break - if new_posts and not post_id: - - # Append posts to main list - # Get the lowest date - max_date = sorted([post["timestamp"] for post in posts])[0] - - # Manually check if we have a lower date than the min date (`min_date`) already. - # This functonality is not natively supported by Tumblr. - if min_date != 0: - if max_date < min_date: - - # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date] + # Append posts to main list + all_posts += new_posts - if posts: - all_posts += posts - break + # Get the lowest date for next loop + max_date = sorted([post["timestamp"] for post in posts])[0] retries = 0 - all_posts += posts - if len(all_posts) >= self.max_posts: self.max_posts_reached = True break @@ -798,6 +815,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): break if "notes" in notes: + notes_retries = 0 # Add some metrics for the first response @@ -805,8 +823,8 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): if first_batch and mode == "conversation": note_metrics = { "note_count": notes["total_notes"], - "reblog_count": notes["total_reblogs"], - "like_count": notes["total_likes"], + "reblog_count": notes.get("total_reblogs", 0), + "like_count": notes.get("total_likes", 0), "reply_count": 0 } first_batch = False diff --git a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css index 9ef57cbcb..d792a915f 100644 --- a/webtool/static/css/explorer/tumblr.css +++ b/webtool/static/css/explorer/tumblr.css @@ -35,6 +35,7 @@ } .author.pseudonymous { + display: inline-block; width: 32px; height: 32px; border-radius: 3px; diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html index 972766273..65ad402ac 100644 --- a/webtool/templates/explorer/datasource-templates/tumblr.html +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -203,6 +203,9 @@
    {% if not pseudonymised %} + {% else %} + + {% endif %}
    From da62b83b08b208cfcbeeddf4bc7a2d719fec5374 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 14:08:54 +0200 Subject: [PATCH 113/204] Some more warnings in the Tumblr search info --- datasources/tumblr/search_tumblr.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index efd932dc5..0f696507b 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -108,8 +108,8 @@ def get_options(cls, parent_dataset=None, user=None): }, "query": { "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags, blogs, or post URLs. Seperate with comma or newline.", - "tooltip": "E.g. #research tools, @4catblog, @4catblog:12347714095" + "help": "Tags, blogs, or post URLs.", + "tooltip": " Seperate with comma or newline. Example:\n#research tools, @4catblog, https://tumblr.com/4catblog/12347714095" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, @@ -190,7 +190,8 @@ def get_options(cls, parent_dataset=None, user=None): options["date-intro"] = { "type": UserInput.OPTION_INFO, "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " - "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries.\n\n" + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " + "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " "the date parameter (before) with six hours and sends the query again. This often " "successfully returns older, un-fetched posts. 
If it didn't find new data after 96 retries (24 " From 0e739b327d8e1e6e1141830630e5e038d65f8843 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 17:34:11 +0200 Subject: [PATCH 114/204] Migrate script for expanded annotation table --- backend/database.sql | 36 ++++- helper-scripts/migrate/migrate-1.45-1.46.py | 155 ++++++++++++++++++++ 2 files changed, 187 insertions(+), 4 deletions(-) create mode 100644 helper-scripts/migrate/migrate-1.45-1.46.py diff --git a/backend/database.sql b/backend/database.sql index 33f0ea393..f51df7e5d 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -67,11 +67,39 @@ CREATE TABLE datasets_owners ( CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops,key text_ops); - -- annotations -CREATE TABLE IF NOT EXISTS annotations ( - key text UNIQUE PRIMARY KEY, - annotations text DEFAULT '' +CREATE TABLE IF NOT EXISTS annotations_new ( + id SERIAL PRIMARY KEY, + post_id TEXT, + dataset TEXT, + timestamp INT DEFAULT 0, + timestamp_created INT DEFAULT 0, + label TEXT, + type TEXT, + options TEXT, + value TEXT, + author TEXT, + is_processor BOOLEAN DEFAULT FALSE, + metadata TEXT +); + +CREATE UNIQUE INDEX IF NOT EXISTS annotation_id + ON annotations_new ( + id +); +CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique + ON annotations_new ( + label, + dataset, + post_id +); +CREATE INDEX IF NOT EXISTS annotation_value + ON annotations_new ( + value +); +CREATE INDEX IF NOT EXISTS annotation_timestamp + ON annotations_new ( + timestamp ); -- metrics diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py new file mode 100644 index 000000000..2246d0ea2 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -0,0 +1,155 @@ +# Update the 'annotations' table so every annotation has its own row. 
+# also add extra data +import sys +import os +import json + +from pathlib import Path + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../..")) +from common.lib.database import Database +from common.lib.logger import Logger + +log = Logger(output=True) + +import configparser + +ini = configparser.ConfigParser() +ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini")) +db_config = ini["DATABASE"] + +db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], + host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") + +print(" Creating new annotations table...") +db.execute(""" +CREATE TABLE IF NOT EXISTS annotations_new ( + id SERIAL PRIMARY KEY, + field_id SERIAL, + post_id TEXT, + dataset TEXT, + timestamp INT DEFAULT 0, + timestamp_created INT DEFAULT 0, + label TEXT, + type TEXT, + options TEXT, + value TEXT, + author TEXT, + is_processor BOOLEAN DEFAULT FALSE, + metadata TEXT +); +""") + +print(" Creating indexes for annotations table...") +db.execute(""" +CREATE UNIQUE INDEX IF NOT EXISTS annotation_id + ON annotations_new ( + id +); +CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique + ON annotations_new ( + label, + dataset, + post_id +); +CREATE INDEX IF NOT EXISTS annotation_value + ON annotations_new ( + value +); +CREATE INDEX IF NOT EXISTS annotation_timestamp + ON annotations_new ( + timestamp +); +""") + +print(" Transferring old annotations to new annotations table...") + +annotations = db.fetchall("SELECT * FROM annotations;") + +if not annotations: + print(" No annotation fields to transfer, skipping...") + +else: + print(" Transferring annotations") + + count = 0 + skipped_count = 0 + + columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,is_processor,metadata" + + # Each row are **all** annotations per dataset + for row in annotations: + + if not row.get("annotations"): + print(" No annotations for dataset %s, skipping..." % row["key"]) + skipped_count += 1 + continue + + dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["key"] + "';") + + # If the dataset is not present anymore, + # we're going to skip these annotations; + # likely the dataset is expired. + if not dataset: + print(" No dataset found for key %s, skipping..." % row["key"]) + skipped_count += 1 + continue + + annotation_fields = json.loads(dataset["annotation_fields"]) + author = dataset.get("creator", "") + + # Loop through all annotated posts + for post_id, post_annotations in json.loads(row["annotations"]).items(): + + # Loop through individual annotations per post + for label, value in post_annotations.items(): + + # Get the ID of this particular annotation field + field_id = [k for k, v in annotation_fields.items() if v["label"] == label] + + if field_id: + field_id = field_id[0] + + # Skip if this field was not saved to the datasets table + if not field_id or field_id not in annotation_fields: + print(" Annotation field ID not saved to datasets table, skipping...") + skipped_count += 1 + continue + + ann_type = annotation_fields[field_id]["type"] + options = annotation_fields[field_id]["options"] if "options" in annotation_fields[field_id] else "" + options = {k: v for d in options for k, v in d.items()} # flatten + + if isinstance(value, list): + value = ",".join(value) + + inserts = [( + str(post_id), # post_id; needs to be a string, changes per data source. 
+ int(field_id), # field_id; this is an ID for the same type of input field. + row["key"], # dataset + dataset["timestamp"], # timestamp + dataset["timestamp"], # timestamp_created + label, # label + ann_type, # type + json.dumps(options) if options else "", # options; each option has a key and a value. + value, # value + author, # author + False, # is_processor + json.dumps({}), # metadata + )] + + db.execute("INSERT INTO annotations_new (" + columns + ") VALUES %s", replacements=inserts) + + count += 1 + + if count % 10 == 0: + print(" Transferred %s annotations..." % count) + +print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) +print(" Deleting old annotations table...") +db.execute("DROP TABLE annotations") + +print(" Renaming new annotations table...") +db.execute("ALTER TABLE annotations_new RENAME TO annotations;") + +print(" - done!") \ No newline at end of file From 3c524c110aab89328539ed0c7a62cac5d547ba63 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 31 Jul 2024 17:39:41 +0200 Subject: [PATCH 115/204] Get annotations per row --- common/lib/dataset.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 656f1d007..1d90111c7 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -850,15 +850,10 @@ def get_annotation_fields(self): def get_annotations(self): """ Retrieves the annotations for this dataset. - return dict: The annotations + return list: All annotations, each in their own dictionary. """ - annotations = self.db.fetchone("SELECT annotations FROM annotations WHERE key = %s;", (self.key,)) - - if annotations and annotations.get("annotations"): - return json.loads(annotations["annotations"]) - else: - return None + return self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) def save_annotation_fields(self, annotation_fields): """ From 9cfd5bd32f813d59c81684db318b906b83b469b9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Wed, 7 Aug 2024 11:40:02 +0200 Subject: [PATCH 116/204] First steps in revamping annotation saving --- VERSION | 2 +- backend/database.sql | 2 +- backend/workers/expire_items.py | 1 + common/lib/dataset.py | 540 ++++++++++++-------- common/lib/exceptions.py | 5 + helper-scripts/migrate/migrate-1.45-1.46.py | 69 ++- webtool/static/js/explorer.js | 130 +++-- webtool/views/views_explorer.py | 12 +- 8 files changed, 479 insertions(+), 282 deletions(-) diff --git a/VERSION b/VERSION index 7a39f43c7..fa2cb2583 100644 --- a/VERSION +++ b/VERSION @@ -1,4 +1,4 @@ -1.44 +1.46 This file should not be modified. It is used by 4CAT to determine whether it needs to run migration scripts to e.g. 
update the database structure to a more diff --git a/backend/database.sql b/backend/database.sql index f51df7e5d..7e551e5d8 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -79,7 +79,7 @@ CREATE TABLE IF NOT EXISTS annotations_new ( options TEXT, value TEXT, author TEXT, - is_processor BOOLEAN DEFAULT FALSE, + by_processor BOOLEAN DEFAULT FALSE, metadata TEXT ); diff --git a/backend/workers/expire_items.py b/backend/workers/expire_items.py index ed4d1cc0f..ddf8afbdb 100644 --- a/backend/workers/expire_items.py +++ b/backend/workers/expire_items.py @@ -62,6 +62,7 @@ def expire_datasets(self): dataset = DataSet(key=dataset["key"], db=self.db) if dataset.is_expired(): self.log.info(f"Deleting dataset {dataset.key} (expired)") + dataset.delete_annotations(dataset_key=dataset.key) dataset.delete() except DataSetNotFoundException: diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 1d90111c7..1418494e1 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -19,7 +19,7 @@ from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, - MapItemException, MappedItemIncompleteException) + MapItemException, MappedItemIncompleteException, AnnotationException) class DataSet(FourcatModule): @@ -835,6 +835,8 @@ def get_columns(self): def get_annotation_fields(self): """ Retrieves the saved annotation fields for this dataset. + These are stored in the annotations table. + :return dict: The saved annotation fields. """ @@ -847,218 +849,12 @@ def get_annotation_fields(self): return annotation_fields - def get_annotations(self): - """ - Retrieves the annotations for this dataset. - return list: All annotations, each in their own dictionary. - """ - - return self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) - - def save_annotation_fields(self, annotation_fields): - """ - Save the annotation fields of a dataset to the datasets table. - If changes to the annotation fields affect older, existing annotations, - this function also updates or deletes those values. - - :param dict annotation_fields: Annotation fields, with a field ID as key - :return int: The number of annotation fields saved. - """ - - # Do some preparations - new_field_ids = set(annotation_fields.keys()) - text_fields = ["textarea", "text"] - option_fields = set() - - # Get existing annotation fields. - old_fields = self.get_annotation_fields() - - # We're saving the new annotation fields as-is - self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(annotation_fields), self.key)) - - # If new annotation fields change the annotations already saved (e.g. if a field is deleted), - # we must also check if we should update annotation data. - # This can get quite complex! - if old_fields: - annotations = self.get_annotations() - - if old_fields and annotations: - - fields_to_delete = set() - labels_to_update = {} - options_to_delete = set() - options_to_update = {} - - for field_id, field in old_fields.items(): - - # We'll delete all prior annotations for a field if its input field is deleted - if field_id not in new_field_ids: - - # Labels are used as keys in the annotations table - # They should already be unique, so that's okay. 
- fields_to_delete.add(field["label"]) - continue - - # If the type has changed, also delete prior references (except between text and textarea) - new_type = annotation_fields[field_id]["type"] - if field["type"] != new_type: - - if not field["type"] in text_fields and not new_type in text_fields: - fields_to_delete.add(field["label"]) - continue - - # If the label has changed, change it in the old annotations - old_label = old_fields[field_id]["label"] - new_label = annotation_fields[field_id]["label"] - - if old_label != new_label: - labels_to_update[old_label] = new_label - - # Check if the options for dropdowns or checkboxes have changed - if new_type == "checkbox" or new_type == "dropdown": - - if "options" in old_fields[field_id]: - - option_fields.add(old_fields[field_id]["label"]) - new_options = annotation_fields[field_id]["options"] - - new_ids = [list(v.keys())[0] for v in new_options] - new_ids = [list(v.keys())[0] for v in new_options] - - # If it's a dropdown or checkbox.. - for option in old_fields[field_id]["options"]: - option_id = list(option.keys())[0] - option_label = list(option.values())[0] - - # If this ID is not present anymore, delete it - if option_id not in new_ids: - options_to_delete.add(option_label) - continue - - # Change the label if it has changed. Bit ugly but it works. - new_label = [list(new_option.values())[0] for i, new_option in enumerate(new_options) if list(new_options[i].keys())[0] == option_id][0] - - if option_label != new_label: - options_to_update[option_label] = new_label - - # Loop through the old annotations if things need to be changed - if fields_to_delete or labels_to_update or options_to_update or options_to_delete: - - for post_id in list(annotations.keys()): - - for field_label in list(annotations[post_id].keys()): - - # Delete the field entirely - if field_label in fields_to_delete: - del annotations[post_id][field_label] - continue - - # Update the label - if field_label in labels_to_update: - annotations[post_id][labels_to_update[field_label]] = annotations[post_id].pop(field_label) - field_label = labels_to_update[field_label] - - # Update or delete option values - if field_label in option_fields: - options_inserted = annotations[post_id][field_label] - - # We can just delete/change the entire annotation if its a string - if type(options_inserted) == str: - - # Delete the option if it's not present anymore - if options_inserted in options_to_delete: - del annotations[post_id][field_label] - - # Update the option label if it has changed - elif options_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[options_inserted] - - # For lists (i.e. checkboxes), we have to loop - elif type(options_inserted) == list: - - for option_inserted in options_inserted: - - # Delete the option if it's not present anymore - if option_inserted in options_to_delete: - annotations[post_id][field_label].remove(option_inserted) - - # Update the option label if it has changed - elif option_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[option_inserted] - - # Delete entire post dict if there's nothing left - if not annotations[post_id]: - del annotations[post_id] - - # Save annotations as an empty string if there's none. - if not annotations: - annotations = "" - - # Save to the annotations table. - self.save_annotations(annotations) - - return len(annotation_fields) - - def save_annotations(self, annotations): - """ - Saves annotations for a dataset to the annotations table. 
- - :param dict annotations: Annotations dict, with post IDs as keys. - :return int: The number of posts with annotations. - - """ - - # If there were already annotations added, we need to make sure - # we're not incorrectly overwriting existing ones. - # We also need to check whether any of the input fields has changed. - # If so, we're gonna edit or remove their old values. - old_annotations = self.get_annotations() - delete_annotations = False - - if old_annotations and annotations: - # Loop through all new annotations and add/overwrite them - # with the old annotations dict. - for post_id in list(annotations.keys()): - old_annotations[post_id] = annotations[post_id] - - # Empty strings, lists, or None as input values get removed - fields_to_delete = [] - for label, values in old_annotations[post_id].items(): - if not values: - fields_to_delete.append(label) - for label in fields_to_delete: - del old_annotations[post_id][label] - delete_annotations = True - - # Empty lists/dicts get removed - if not old_annotations[post_id]: - del old_annotations[post_id] - delete_annotations = True - - annotations = old_annotations - - # If there's nothing to save or delete, do nothing - if not annotations and not delete_annotations: - return 0 - - # If the annotations are empty, remove the row from the annotations table - if len(annotations) == 0: - self.db.delete("annotations", {"key": self.key}) - return 0 - - # If there's something to add or change, - # we're saving all annotations as a JSON string - annotations = json.dumps(annotations) - self.db.upsert("annotations", {"key": self.key, "annotations": annotations}, constraints=["key"]) - - return len(annotations) - def update_label(self, label): """ Update label for this dataset - :param str label: New label - :return str: The new label, as returned by get_label + :param str label: New label + :return str: The new label, as returned by get_label """ self.parameters["label"] = label @@ -1798,6 +1594,332 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w # No other log available raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn") + # Annotation features + def get_annotations(self): + """ + Retrieves the annotations for this dataset. + return list: All annotations, each in their own dictionary. + """ + annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + if not annotations: + annotations = None + + return annotations + + def has_annotations(self): + """ + Returns True if there's one or more annotations found + """ + + annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + return True if annotation else False + + def has_annotation_fields(self): + """ + Returns True if there's annotation fields saved tot the dataset table + """ + + annotation_fields = self.get_annotation_fields() + + return True if annotation_fields else False + + def save_annotation_fields(self, new_fields, add=False): + """ + Save annotation field data to the datasets table (in the `annotation_fields` column). + If changes to the annotation fields affect existing annotations, + this function will also call `update_annotations_via_fields()` to change them. + + :param dict new_fields: Annotation fields, with a field ID as key. + + :param bool add: Wether we're merely adding new fields + or replacing the whole batch. If add is false, + `new_fields` should contain all fields. 
+ + :return int: The number of annotation fields saved. + + """ + + # Get existing annotation fields to see if stuff changed. + old_fields = self.get_annotation_fields() + changes = False + + # Do some validation + # Annotation field must be valid JSON. + try: + s = json.dumps(new_fields) + except ValueError: + raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) + + # Annotation fields must at minimum have `type` and `label` keys. + for field_id, annotation_field in new_fields.items(): + if not isinstance(field_id, str): + raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) + if not "label" in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) + if not "type" in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) + + # Keep track of whether existing fields have changed; if so, we're going to + # update the annotations table. + if field_id in old_fields: + if old_fields[field_id] != annotation_field: + changes = True + + # If we're just adding fields, add them to the old fields + # If the field already exists, overwrite the old field. + if add and old_fields: + all_fields = old_fields + for field_id, annotation_field in new_fields.items(): + all_fields[field_id] = annotation_field + new_fields = all_fields + + # We're saving the new annotation fields as-is. + # Ordering of fields is preserved this way. + self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) + + # If we're adding but the field already exists, update/delete annotations with that ID. + add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) + + if changes or add_and_overlap: + update_annotations_via_fields(old_fields, new_fields) + + return len(new_fields) + + def update_annotations_via_fields(self, old_fields, new_fields): + """ + Updates annotations in the annotations table if the input field + itself has been changed, for instance if a dropdown label is renamed. + + :param di old_fields: Old annotation fields + :param di new_fields: New annotation fields; this should contain not just + additions, but all fields, changed or otherwise. + + """ + + new_field_ids = set(annotation_fields.keys()) + text_fields = ["textarea", "text"] + + # If old and new fields are identical, do nothing. + if old_fields == new_fields: + return + + # Only update annotations if they, in fact, exist. + annotations = self.get_annotations() + if not annotations: + return + + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID + + # Loop through the old annotation fields + for old_field_id, old_field in old_fields.items(): + + # Delete all annotations of this type if the field is deleted. + if old_field_id not in new_fields: + fields_to_delete.add(old_field_id) + continue + + new_field = annotation_fields[old_field_id] + + # If the annotation type has changed, also delete existing annotations, + # except between text and textarea, where we can just change the type and keep the text. 
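# For instance (hypothetical field ID "f1"): dropdown -> checkbox deletes the
# old annotations, since the stored option values no longer apply, while
# text -> textarea keeps them:
#
#   old_fields = {"f1": {"label": "tone", "type": "dropdown", "options": {...}}}
#   new_fields = {"f1": {"label": "tone", "type": "checkbox", "options": {...}}}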
+ if old_field["type"] != new_field["type"]: + if not old_field["type"] in text_fields and not new_field["type"] in text_fields: + fields_to_delete.add(field_id) + continue + + # Loop through all the key/values in the new field data + # and update in case it's different from the old values. + update_data = {} + for field_key, field_value in new_field.items(): + + # Update if values don't match + if field_value != old_field.get(field_key): + + # Special case: option values that are removed/renamed. + # Here we only have to change specific values within the + # values column. + if field_key == "options": + + new_options = field_value + # Delete annotations of this type if all option fields are deleted + # (even though this should not be possible in the Explorer front-end) + if not new_options: + fields_to_delete.add(field_id) + continue + + old_options = old_field["options"] + + options_to_update = {} + + # Options are saved in a dict with IDs and labels as keys/values. + for old_option_id, old_option in old_options.items(): + # Renamed option label + if old_option_id in new_options and old_option != new_options[old_option_id]: + options_to_update[old_option] = new_options[option] # Old label -> new label + # Deleted option + elif old_option_id not in new_options: + options_to_update[old_option] = None # Remove None labels + + if options_to_update: + update_data[field_key] = {} + update_data[field_key]["options"] = options_to_update + + # For all other changes, just overwrite with new data. + else: + update_data[field_key] = field_value + + if update_data: + fields_to_update[new_field_id] = update_data + + # Delete annotations + if fields_to_delete: + self.delete_annotations(field_id=list(fields_to_delete)) + + # Change annotations based on changes in update fields + if fields_to_update: + new_annotations = [] + for annotation in annotations: + if annotation["field_id"] in fields_to_update: + for k, update_field in fields_to_update[annotation["field_id"]]: + + # Special case: Changed options + if k == "options": + new_values = [] + for inserted_option in annotations["value"].split(","): + if inserted_option in update_field: + if update_field[inserted_option] == None: + # Don't add + continue + elif inserted_option in update_field: + # Replace with new value + new_values.append(annotation["value"][old_option]) + else: + # Keep old value + new_values.append(inserted_option) + + update_field = new_values + + annotation[k] = update_field + + new_annotations.append(annotation) + + # Save updated annotations + self.save_annotations(new_annotations) + + def save_annotations(self, annotations, overwrite=True): + """ + Takes a list of annotations and saves them to the annotations table. + If a field is not yet present in the datasets table, it also adds it there. + + :param list annotations: List of dictionaries with annotation items. + :param bool overwrite: Whether to overwrite annotation if the label is already present + for the dataset. + + :returns int: How many annotations were saved. + + """ + + # Should be present for all annotation fields + mandatory_keys = ["post_id", "label", "value"] + + field_keys = {} + annotations_to_delete = set() + + # We're going to add the annotation metadata to the datasets table + # based on the annotations themselves. 
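# A minimal, hypothetical input for this method; everything else (dataset
# key, type, field_id, and timestamps) is filled in by the code below:
#
#   dataset.save_annotations([
#       {"post_id": "123456789", "label": "sentiment", "value": "positive"},
#   ])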
+ annotation_fields = self.get_annotation_fields() + existing_annotations = self.get_annotations() + existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] + + timestamp = time.time() + + new_annotations = [] + for annotation in annotations: + + # Do some validation; dataset key, post_id, label, and value need to be present. + missing_keys = [] + for mandatory_key in mandatory_keys: + if mandatory_key not in annotation: + missing_keys.append(mandatory_key) + if missing_keys: + raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) + + # Add dataset key + annotation["dataset"] = self.key + + # Raise exception if this label is already present for this dataset + # and we're not overwriting + if not overwrite and annotation["label"] in existing_labels: + raise AnnotationException("Couldn't save annotations; label %s already present") + + # If there's no type given, use 'text' + if not annotation.get("type"): + annotation["type"] = "text" + + # If there's no timestamp given, set it to the current time. + if not "timestamp" in annotation: + annotation["timestamp"] = timestamp + annotation["timestamp_created"] = timestamp + + # If not already given, create an ID for this annotation + # based on the label, type, and dataset key. + if "field_id" not in annotation: + field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) + field_id = int.from_bytes(field_id_base.encode(), "little") + annotation["field_id"] = field_id + + # Add annotation metadata if it is not saved to the datasets table yet. + # This is just a simple dict with a field ID, type, label, and possible options. + if annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation["type"] + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + new_annotations.append(annotation) + + # Save annotation fields if they're not present yet. + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + # If there's nothing to save or delete, do nothing + if not new_annotations: + return 0 + + # Overwrite old annotations with upsert. Else add. + self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + + return len(new_annotations) + + def delete_annotations(self, dataset_key=None, id=None, field_id=None): + """ + Deletes all annotations for an entire dataset or by a list of (field) IDs. + + :param str dataset_key: A dataset key. + :param li id: A list or string of unique annotation IDs. + :param li field_id: A list or string of IDs for annotation fields. + + :return int: The number of removed records. + """ + + if not dataset and not ids and not field_ids: + return 0 + + where = {} + if dataset_key: + where["dataset"] = dataset_key + if ids: + where["id"] = ids + if field_ids: + where["field_id"] = field_ids + + return self.db.delete("annotations", where) + def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/common/lib/exceptions.py b/common/lib/exceptions.py index 01bd9813f..f187b4258 100644 --- a/common/lib/exceptions.py +++ b/common/lib/exceptions.py @@ -44,6 +44,11 @@ class ProcessorException(FourcatException): """ pass +class AnnotationException(FourcatException): + """ + Raise for exceptions with setting/getting annotations. 
+ """ + pass class MapItemException(ProcessorException): """ diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 2246d0ea2..0a650773e 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -21,7 +21,37 @@ db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"], host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate") -print(" Creating new annotations table...") + +datasets = db.fetchall("SELECT * FROM datasets WHERE annotation_fields != ''") + +print(" Converting annotation options from lists to dicts...") +for dataset in datasets: + + annotation_fields = dataset["annotation_fields"] + + # Flatten options from list of dicts to dict + options_converted = False + annotation_fields = json.loads(annotation_fields) + new_annotation_fields = annotation_fields + + for field_id, annotation_field in annotation_fields.items(): + + if "options" in annotation_field: + + flattened_options = {} + if isinstance(annotation_field["options"], list): + for op in annotation_field["options"]: + flattened_options.update(op) + new_annotation_fields[field_id]["options"] = flattened_options + options_converted = True + + if options_converted: + print(" Converting annotation options to list for dataset %s..." % dataset["key"]) + db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_annotation_fields), dataset["key"])) + +print(" Expanding the 'annotations' table.") + +print(" Creating new annotations table...") db.execute(""" CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, @@ -35,12 +65,12 @@ options TEXT, value TEXT, author TEXT, - is_processor BOOLEAN DEFAULT FALSE, + by_processor BOOLEAN DEFAULT FALSE, metadata TEXT ); """) -print(" Creating indexes for annotations table...") +print(" Creating indexes for annotations table...") db.execute(""" CREATE UNIQUE INDEX IF NOT EXISTS annotation_id ON annotations_new ( @@ -62,42 +92,42 @@ ); """) -print(" Transferring old annotations to new annotations table...") +print(" Transferring old annotations to new annotations table...") annotations = db.fetchall("SELECT * FROM annotations;") if not annotations: - print(" No annotation fields to transfer, skipping...") + print(" No annotation fields to transfer, skipping...") else: - print(" Transferring annotations") count = 0 skipped_count = 0 - columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,is_processor,metadata" + columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: - - if not row.get("annotations"): - print(" No annotations for dataset %s, skipping..." % row["key"]) - skipped_count += 1 - continue - dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["key"] + "';") + dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["dataset"] + "';") # If the dataset is not present anymore, # we're going to skip these annotations; # likely the dataset is expired. if not dataset: - print(" No dataset found for key %s, skipping..." % row["key"]) + print(" No dataset found for key %s, skipping..." 
% row["dataset"]) skipped_count += 1 continue annotation_fields = json.loads(dataset["annotation_fields"]) author = dataset.get("creator", "") + + if not row.get("annotations"): + print(" No annotations for dataset %s, skipping..." % row["dataset"]) + skipped_count += 1 + continue + # Loop through all annotated posts for post_id, post_annotations in json.loads(row["annotations"]).items(): @@ -112,7 +142,7 @@ # Skip if this field was not saved to the datasets table if not field_id or field_id not in annotation_fields: - print(" Annotation field ID not saved to datasets table, skipping...") + print(" Annotation field ID not saved to datasets table, skipping...") skipped_count += 1 continue @@ -126,7 +156,7 @@ inserts = [( str(post_id), # post_id; needs to be a string, changes per data source. int(field_id), # field_id; this is an ID for the same type of input field. - row["key"], # dataset + row["dataset"], # dataset dataset["timestamp"], # timestamp dataset["timestamp"], # timestamp_created label, # label @@ -134,7 +164,7 @@ json.dumps(options) if options else "", # options; each option has a key and a value. value, # value author, # author - False, # is_processor + False, # by_processor json.dumps({}), # metadata )] @@ -143,9 +173,10 @@ count += 1 if count % 10 == 0: - print(" Transferred %s annotations..." % count) + print(" Transferred %s annotations..." % count) -print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) + print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count)) + print(" Deleting old annotations table...") db.execute("DROP TABLE annotations") diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c6a9daab7..75f97a515 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -215,8 +215,10 @@ const annotations = { }, parseAnnotationFields: function (e) { - // Validates and converts the fields in the annotations editor. - // Returns an object with the set annotation fields. + /* + Validates and converts the fields in the annotations editor. + Returns an object with the set annotation fields. + */ var annotation_fields = {}; var warning = ""; @@ -237,7 +239,7 @@ const annotations = { let label = label_field.val().replace(/\s+/g, ' '); - // Get the random identifier of the field, so we + // Get the ID of the field, so we // can later check if it already exists. let field_id = parseInt(this.id.split("-")[1]); @@ -246,7 +248,7 @@ const annotations = { label_field.addClass("invalid"); warning = "Input names can't be empty"; } - // Make sure the names can't be duplicates. + // Make sure the names can't be duplicates else if (labels_added.includes(label)) { warning = "Field labels must be unique"; label_field.addClass("invalid"); @@ -280,7 +282,7 @@ const annotations = { if (!option_labels.includes(option_label) && option_label.length > 0) { - // We're using a unique key for these to match input fields. + // We're using a unique key for options as well. option = {} option[option_id] = option_label options.push(option); @@ -317,6 +319,74 @@ const annotations = { return annotation_fields; }, + parseAnnotation: function(e) { + /* + Converts the DOM objects of an annotation field + to an annotation Object. 
+ + Must be given an input field element + + */ + + annotation = {} + + let label = $(this).find(".annotation-label").text(); + let annotation_type = $(this).attr("class").split(" ").pop(); + let val = undefined; + let edited = false + let timestamp = Date.now() / 100 + + if (annotation_type == "text" || annotation_type == "textarea") { + val = $(this).find(".post-annotation-input").val(); + // It can be the case that the input text is deleted + // In this case we *do* want to push new data, so we check + // whether there's an 'edited' class present and save if so. + if ($(this).find(".post-annotation-input").hasClass("edited")) { + edited = true + } + } + else if (annotation_type == "dropdown") { + let selected = $(this).find(".post-annotation-options").val(); + val = selected; + } + else if (annotation_type == "checkbox") { + val = []; + $(this).find(".post-annotation-options > input").each(function(){ + if ($(this).is(":checked")) { + val.push($(this).val()); + } + if ($(this).hasClass("edited")) { + edited = true + } + }); + if (!val.length > 0) { + val = undefined; + } + } + if ((val != undefined && val != "") || edited) { + vals_changed = true; + val = ""; + } + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": "", + "post_id": post_id, + "dataset": "", + "timestamp": timestamp, + "timestamp_created": "", + "label": label, + "type": annotation_type, + "options": , + "value": "", + "author": "", + "by_processor": "", + "metadata": "" + } + + return annotation + }, + applyAnnotationFields: function (e){ // Applies the annotation fields to each post on this page. @@ -631,9 +701,9 @@ const annotations = { saveAnnotations: function (e){ // Write the annotations to the dataset and annotations table. - // First we're gonna collect the data for this page. - // Loop through each post's annotation field. - var anns = {}; + // First we're going to collect the data for this page. + // Loop through each post's annotation fields. + var anns = []; var dataset_key = $("#dataset-key").text(); $(".posts > li").each(function(){ @@ -644,50 +714,20 @@ const annotations = { if (post_annotations.length > 0) { - let post_vals = {}; post_annotations.find(".post-annotation").each(function(){ - let label = $(this).find(".annotation-label").text(); - let annotation_type = $(this).attr("class").split(" ").pop(); - let val = ""; - let edited = false - - if (annotation_type == "text" || annotation_type == "textarea") { - val = $(this).find(".post-annotation-input").val(); - // It can be the case that the input text is deleted - // In this case we *do* want to push new data, so we check - // whether there's an 'edited' class present and save if so. 
- if ($(this).find(".post-annotation-input").hasClass("edited")) { - edited = true - } - } - else if (annotation_type == "dropdown") { - let selected = $(this).find(".post-annotation-options").val(); - val = selected; - } - else if (annotation_type == "checkbox") { - val = []; - $(this).find(".post-annotation-options > input").each(function(){ - if ($(this).is(":checked")) { - val.push($(this).val()); - } - if ($(this).hasClass("edited")) { - edited = true - } - }); - if (!val.length > 0) { - val = undefined; - } - } - if ((val != undefined && val != "") || edited) { - vals_changed = true; - post_vals[label] = val; + // Extract annotation object from the element + let annotation = parseAnnotation(this); + + if (annotation) { + annotations.push(annotation); } }); if (vals_changed){ - anns[post_id] = post_vals; + annotation[post_id] = post_vals; } + } }) diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 18d657387..466735a0f 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -132,11 +132,9 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations inserted already. # If so, also pass these to the template. - annotations = db.fetchone("SELECT * FROM annotations WHERE key = %s", (key,)) - if not annotations or not annotations.get("annotations"): - annotations = None - else: - annotations = json.loads(annotations["annotations"]) + annotations = db.fetchall("SELECT * FROM annotations WHERE key = %s", (key,)) + if annotations: + annotations = json.loads(annotations) # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) @@ -221,7 +219,7 @@ def explorer_api_posts(datasource, post_ids): @openapi.endpoint("explorer") def explorer_save_annotation_fields(key): """ - Save teh annotation fields of a dataset to the datasets table. + Save the annotation fields of a dataset to the datasets table. :param str key: The dataset key. @@ -255,7 +253,7 @@ def explorer_save_annotations(key): :param str key: The dataset key. - :return-error 404: If the dataset ID does not exist. + :return-error 404: If the dataset key does not exist. :return int: The number of posts with annotations saved. 
""" From e95c4bd33bffc09c15ca0a5f684c2b3415cb0815 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 7 Aug 2024 12:47:50 +0200 Subject: [PATCH 117/204] remove unused variables in explorer.js --- webtool/static/js/explorer.js | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c6a9daab7..ae317e82d 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -74,7 +74,7 @@ const annotations = { // Delete an entire annotation input // We're in a grid of threes, so this involves three divs - editor_controls.on("click", ".annotation-field > .delete-input", function(e){ + editor_controls.on("click", ".annotation-field > .delete-input", function(){ let parent_div = $(this).parent().parent(); parent_div.next().remove(); // Input type parent_div.next().remove(); // Options @@ -105,11 +105,12 @@ const annotations = { }); // Make saving available when annotations are changed - $(".post-annotations").on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); - $(".post-annotations").on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); + let post_annotations = $(".post-annotations"); + post_annotations.on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); + post_annotations.on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); // Keep track of whether the annotations are edited or not. - $(".post-annotations").on("keydown change", ".post-annotation-input, .post-annotation input, .post-annotation textarea", function(){$(this).addClass("edited")}); + post_annotations.on("keydown change", ".post-annotation-input, .post-annotation input, .post-annotation textarea", function(){$(this).addClass("edited")}); // Save the annotations to the database $("#save-annotations").on("click", function(){ @@ -125,8 +126,6 @@ const annotations = { } }) - var old_annotation_fields = $("#annotation-fields").html(); - // Check whether there's already fields saved for this dataset annotations.fieldsExist(); @@ -143,7 +142,6 @@ const annotations = { // Change the type of input fields when switching in the dropdown let type = $(el).val(); - let old_type = $(el).attr("data-val"); let options = $(el).parent().parent().next(); let option_fields = options.find(".option-field"); From 2fdb87640140ef8ec591185e2d15f8ce1599b9aa Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 7 Aug 2024 17:52:24 +0200 Subject: [PATCH 118/204] First steps in giving annotations their own class --- backend/database.sql | 4 +- common/lib/annotation.py | 265 ++++++++++++++++++++ common/lib/dataset.py | 164 ++++-------- helper-scripts/migrate/migrate-1.45-1.46.py | 1 - processors/metrics/count_posts.py | 7 + webtool/static/js/explorer.js | 14 +- webtool/views/views_explorer.py | 23 +- 7 files changed, 343 insertions(+), 135 deletions(-) create mode 100644 common/lib/annotation.py diff --git a/backend/database.sql b/backend/database.sql index 7e551e5d8..eafc9aaf8 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -68,9 +68,9 @@ CREATE TABLE datasets_owners ( CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops,key text_ops); -- annotations -CREATE TABLE IF NOT EXISTS annotations_new ( +CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, - post_id TEXT, + item_id TEXT, dataset 
TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, diff --git a/common/lib/annotation.py b/common/lib/annotation.py new file mode 100644 index 000000000..771e28343 --- /dev/null +++ b/common/lib/annotation.py @@ -0,0 +1,265 @@ +""" +Annotation class +""" + +from common.config_manager import config + +class Annotation: + """ + Annotation class + + Annotations are always tied to a dataset and an item ID. + + """ + + # Attributes must be created here to ensure getattr and setattr work properly + + data = None + db = None + + id = "" # Unique ID for this annotation + parent_id = "" # ID of the data for this annotation, e.g. post ID + dataset = "" # Dataset key this annotation is generated from + timestamp = 0 # When this annotation was edited + timestamp_created = 0 # When this timestamp was created + label = "" # Label of annotation + options = [] # Possible options + value = "" # The actual annotation value + author = "" # Who made the annotation + by_processor = False # Whether the annotation was made by a processor + metadata = {} # Misc metadata + + def __init__(self, db, data, id=None, item_id=None, label=None, dataset_key=None): + """ + Instantiate annotation object. + + :param db: Database connection object + :param dict data: Annotation data; should correspond to the annotations table records. + + """ + + self.db = db + self.data = data + self.item_id = item_id + + if id is not None: + self.id = id + current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.id,)) + if not current: + raise AnnotationException( + "Annotation() requires a valid ID for its 'id' argument, \"%s\" given" % id) + + # Should be present for all annotation fields + mandatory_keys = ["post_id", "label", "value"] + + + if dataset_key is not None and label is not None and dataset_key is not None: + current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.key,)) + if not current: + raise DataSetNotFoundException( + "DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) + + + def get_by_id(db, id): + """ + Get annotation by ID + + :param db: Database connection object + :param str name: ID of annotation + :return: Annotation object, or `None` for invalid annotation ID + """ + data = db.fetchone("SELECT * FROM annotations WHERE id = %s", (id,)) + if not annotation: + return None + else: + return Annotation.get_by_data(db, data) + + def get_by_data(db, data): + """ + Instantiate annotation object with given data + + :param db: Database handler + :param dict data: Annotation data, should correspond to a database row + :return Annotation: Annotation object + """ + return Annotation(db, data) + + def set_id_by_data(self, item): + """ + Creates an ID based on the data of the item it has annotated. + + + """ + + + return True + + def save(self): + """ + Save an annotation to the database. + """ + return True + + @staticmethod + def save_many(self, annotations, overwrite=True): + """ + Takes a list of annotations and saves them to the annotations table. + If a field is not yet present in the datasets table, it also adds it there. + + :param bool overwrite: Whether to overwrite annotation if the label is already present + for the dataset. + + :returns int: How many annotations were saved. + + """ + + field_keys = {} + annotations_to_delete = set() + + # We're going to add the annotation metadata to the datasets table + # based on the annotations themselves. 
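# Under the new schema each annotation becomes one row in the `annotations`
# table instead of an entry in a per-dataset JSON blob. A stored row looks
# roughly like this (hypothetical values):
#
#   {"item_id": "123", "dataset": "abcd1234", "label": "sentiment",
#    "type": "text", "value": "positive", "author": "some_user",
#    "by_processor": False, "metadata": "{}"}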
+ annotation_fields = self.get_annotation_fields() + existing_annotations = self.get_annotations() + existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] + + timestamp = time.time() + + new_annotations = [] + for annotation in annotations: + + # Do some validation; dataset key, post_id, label, and value need to be present. + missing_keys = [] + for mandatory_key in mandatory_keys: + if mandatory_key not in annotation: + missing_keys.append(mandatory_key) + if missing_keys: + raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) + + # Add dataset key + annotation["dataset"] = self.key + + # Raise exception if this label is already present for this dataset + # and we're not overwriting + if not overwrite and annotation["label"] in existing_labels: + raise AnnotationException("Couldn't save annotations; label %s already present") + + # If there's no type given, use 'text' + if not annotation.get("type"): + annotation["type"] = "text" + + # If there's no timestamp given, set it to the current time. + if not "timestamp" in annotation: + annotation["timestamp"] = timestamp + annotation["timestamp_created"] = timestamp + + # If not already given, create an ID for this annotation + # based on the label, type, and dataset key. + if "field_id" not in annotation: + field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) + field_id = int.from_bytes(field_id_base.encode(), "little") + annotation["field_id"] = field_id + + # Add annotation metadata if it is not saved to the datasets table yet. + # This is just a simple dict with a field ID, type, label, and possible options. + if annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation["type"] + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + new_annotations.append(annotation) + + # Save annotation fields if they're not present yet. + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + # If there's nothing to save or delete, do nothing + if not new_annotations: + return 0 + + # Overwrite old annotations with upsert. Else add. + self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + + return len(new_annotations) + + def delete(self): + """ + Deletes this annotation + """ + return self.db.delete("annotations", {"id": self.id}) + + @staticmethod + def delete_many(self, dataset_key=None, id=None, field_id=None): + """ + Deletes annotations for an entire dataset or by a list of (field) IDs. + + :param str dataset_key: A dataset key. + :param li id: A list or string of unique annotation IDs. + :param li field_id: A list or string of IDs for annotation fields. + + :return int: The number of removed records. 
+ """ + if not dataset_key and not id and not field_id: + return 0 + + where = {} + if dataset_key: + where["dataset"] = dataset_key + if id: + where["id"] = id + if field_id: + where["field_id"] = field_id + + return self.db.delete("annotations", where) + + + def __getattr__(self, attr): + """ + Getter so we don't have to use .data all the time + + :param attr: Data key to get + :return: Value + """ + + if attr in dir(self): + # an explicitly defined attribute should always be called in favour + # of this passthrough + attribute = getattr(self, attr) + return attribute + elif attr in self.data: + return self.data[attr] + else: + raise AttributeError("Annotation instance has no attribute %s" % attr) + + def __setattr__(self, attr, value): + """ + Setter so we can flexibly update the database + + Also updates internal data stores (.data etc). If the attribute is + unknown, it is stored within the 'metadata' attribute. + + :param str attr: Attribute to update + :param value: New value + """ + + # don't override behaviour for *actual* class attributes + if attr in dir(self): + super().__setattr__(attr, value) + return + + if attr not in self.data: + self.parameters[attr] = value + attr = "metadata" + value = self.parameters + + if attr == "metadata": + value = json.dumps(value) + + self.db.update("annotations", where={"id": self.id}, data={attr: value}) + + self.data[attr] = value + + if attr == "metadata": + self.parameters = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 1418494e1..92dcfc625 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -10,13 +10,12 @@ import csv import re -from pathlib import Path - import backend from common.config_manager import config +from common.lib.annotation import Annotation from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, convert_to_float, flatten_dict -from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.item_mapping import MappedItem, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, MapItemException, MappedItemIncompleteException, AnnotationException) @@ -83,20 +82,17 @@ def __init__(self, parameters=None, key=None, job=None, data=None, db=None, pare if not current: raise DataSetNotFoundException("DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) - query = current["query"] elif job is not None: current = self.db.fetchone("SELECT * FROM datasets WHERE parameters::json->>'job' = %s", (job,)) if not current: raise DataSetNotFoundException("DataSet() requires a valid job ID for its 'job' argument") - query = current["query"] self.key = current["key"] elif data is not None: current = data if "query" not in data or "key" not in data or "parameters" not in data or "key_parent" not in data: raise DataSetException("DataSet() requires a complete dataset record for its 'data' argument") - query = current["query"] self.key = current["key"] else: if parameters is None: @@ -217,7 +213,7 @@ def clear_log(self): extension. 
""" log_path = self.get_log_path() - with log_path.open("w") as outfile: + with log_path.open("w"): pass def log(self, log): @@ -347,7 +343,7 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau if annotation_fields: annotations = self.get_annotations() - # missing field strategy can be for all fields at once, or per field + # missing field strategy can be for all fields at once, or per field # if it is per field, it is a dictionary with field names and their strategy # if it is for all fields, it is may be a callback, 'abort', or 'default' default_strategy = "default" @@ -832,23 +828,6 @@ def get_columns(self): # Filetype not CSV or an NDJSON with `map_item` return [] - def get_annotation_fields(self): - """ - Retrieves the saved annotation fields for this dataset. - These are stored in the annotations table. - - :return dict: The saved annotation fields. - """ - - annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.key,)) - - if annotation_fields and annotation_fields.get("annotation_fields"): - annotation_fields = json.loads(annotation_fields["annotation_fields"]) - else: - annotation_fields = {} - - return annotation_fields - def update_label(self, label): """ Update label for this dataset @@ -908,7 +887,6 @@ def change_datasource(self, datasource): """ Change the datasource type for this dataset - :param str label: New datasource type :return str: The new datasource type """ @@ -1604,6 +1582,8 @@ def get_annotations(self): if not annotations: annotations = None + else: + annotations = [Annotation(data=annotation, dataset=self) for annotation in annotations] return annotations @@ -1625,15 +1605,32 @@ def has_annotation_fields(self): return True if annotation_fields else False + def get_annotation_fields(self): + """ + Retrieves the saved annotation fields for this dataset. + These are stored in the annotations table. + + :return dict: The saved annotation fields. + """ + + annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.key,)) + + if annotation_fields and annotation_fields.get("annotation_fields"): + annotation_fields = json.loads(annotation_fields["annotation_fields"]) + else: + annotation_fields = None + + return annotation_fields + def save_annotation_fields(self, new_fields, add=False): """ Save annotation field data to the datasets table (in the `annotation_fields` column). If changes to the annotation fields affect existing annotations, this function will also call `update_annotations_via_fields()` to change them. - :param dict new_fields: Annotation fields, with a field ID as key. + :param dict new_fields: New annotation fields, with a field ID as key. - :param bool add: Wether we're merely adding new fields + :param bool add: Whether we're merely adding new fields or replacing the whole batch. If add is false, `new_fields` should contain all fields. @@ -1648,7 +1645,7 @@ def save_annotation_fields(self, new_fields, add=False): # Do some validation # Annotation field must be valid JSON. 
try: - s = json.dumps(new_fields) + json.dumps(new_fields) except ValueError: raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) @@ -1683,7 +1680,7 @@ def save_annotation_fields(self, new_fields, add=False): add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) if changes or add_and_overlap: - update_annotations_via_fields(old_fields, new_fields) + self.update_annotations_via_fields(old_fields, new_fields) return len(new_fields) @@ -1698,7 +1695,6 @@ def update_annotations_via_fields(self, old_fields, new_fields): """ - new_field_ids = set(annotation_fields.keys()) text_fields = ["textarea", "text"] # If old and new fields are identical, do nothing. @@ -1721,7 +1717,8 @@ def update_annotations_via_fields(self, old_fields, new_fields): fields_to_delete.add(old_field_id) continue - new_field = annotation_fields[old_field_id] + field_id = old_field_id + new_field = new_fields[field_id] # If the annotation type has changed, also delete existing annotations, # except between text and textarea, where we can just change the type and keep the text. @@ -1758,7 +1755,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): for old_option_id, old_option in old_options.items(): # Renamed option label if old_option_id in new_options and old_option != new_options[old_option_id]: - options_to_update[old_option] = new_options[option] # Old label -> new label + options_to_update[old_option] = new_options[old_option_id] # Old label -> new label # Deleted option elif old_option_id not in new_options: options_to_update[old_option] = None # Remove None labels @@ -1772,7 +1769,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): update_data[field_key] = field_value if update_data: - fields_to_update[new_field_id] = update_data + fields_to_update[field_id] = update_data # Delete annotations if fields_to_delete: @@ -1782,7 +1779,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): if fields_to_update: new_annotations = [] for annotation in annotations: - if annotation["field_id"] in fields_to_update: + if annotation.field_id in fields_to_update: for k, update_field in fields_to_update[annotation["field_id"]]: # Special case: Changed options @@ -1795,7 +1792,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): continue elif inserted_option in update_field: # Replace with new value - new_values.append(annotation["value"][old_option]) + new_values.append(update_field[inserted_option]) else: # Keep old value new_values.append(inserted_option) @@ -1810,7 +1807,7 @@ def update_annotations_via_fields(self, old_fields, new_fields): self.save_annotations(new_annotations) def save_annotations(self, annotations, overwrite=True): - """ + """ Takes a list of annotations and saves them to the annotations table. If a field is not yet present in the datasets table, it also adds it there. @@ -1822,79 +1819,20 @@ def save_annotations(self, annotations, overwrite=True): """ - # Should be present for all annotation fields - mandatory_keys = ["post_id", "label", "value"] - - field_keys = {} - annotations_to_delete = set() - - # We're going to add the annotation metadata to the datasets table - # based on the annotations themselves. 
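# (The block removed below merged new values into the per-dataset JSON blob;
# in the rewritten flow this bookkeeping moves to Annotation.save_many() and
# the per-row `annotations` table.)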
- annotation_fields = self.get_annotation_fields() - existing_annotations = self.get_annotations() - existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] - - timestamp = time.time() - - new_annotations = [] - for annotation in annotations: - - # Do some validation; dataset key, post_id, label, and value need to be present. - missing_keys = [] - for mandatory_key in mandatory_keys: - if mandatory_key not in annotation: - missing_keys.append(mandatory_key) - if missing_keys: - raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) - - # Add dataset key - annotation["dataset"] = self.key - - # Raise exception if this label is already present for this dataset - # and we're not overwriting - if not overwrite and annotation["label"] in existing_labels: - raise AnnotationException("Couldn't save annotations; label %s already present") - - # If there's no type given, use 'text' - if not annotation.get("type"): - annotation["type"] = "text" - - # If there's no timestamp given, set it to the current time. - if not "timestamp" in annotation: - annotation["timestamp"] = timestamp - annotation["timestamp_created"] = timestamp - - # If not already given, create an ID for this annotation - # based on the label, type, and dataset key. - if "field_id" not in annotation: - field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) - field_id = int.from_bytes(field_id_base.encode(), "little") - annotation["field_id"] = field_id - - # Add annotation metadata if it is not saved to the datasets table yet. - # This is just a simple dict with a field ID, type, label, and possible options. - if annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation["type"] - } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] - - new_annotations.append(annotation) - - # Save annotation fields if they're not present yet. - if annotation_fields != self.get_annotation_fields(): - self.save_annotation_fields(annotation_fields) - - # If there's nothing to save or delete, do nothing - if not new_annotations: + if not annotations: return 0 - # Overwrite old annotations with upsert. Else add. - self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) + # Add dataset info to annotations + key = self.key + owner = self.get_owners()[0] + if "dataset" + for i in range(len(annotations)): + if not annotations[i].get("dataset"): + annotations[i]["dataset"] = key + if not annotations[i].get("author"): + annotations[i]["author"] = owner - return len(new_annotations) + return Annotation.save_many(annotations, overwrite=overwrite) def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ @@ -1907,16 +1845,16 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): :return int: The number of removed records. 
""" - if not dataset and not ids and not field_ids: + if not dataset_key and not id and not field_id: return 0 where = {} if dataset_key: where["dataset"] = dataset_key - if ids: - where["id"] = ids - if field_ids: - where["field_id"] = field_ids + if id: + where["id"] = id + if field_id: + where["field_id"] = field_id return self.db.delete("annotations", where) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 0a650773e..e0248cfcf 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -122,7 +122,6 @@ annotation_fields = json.loads(dataset["annotation_fields"]) author = dataset.get("creator", "") - if not row.get("annotations"): print(" No annotations for dataset %s, skipping..." % row["dataset"]) skipped_count += 1 diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index 3114d4049..ea1ef48f5 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -3,6 +3,7 @@ """ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor +from common.lib.annotation import Annotation from backend.lib.processor import BasicProcessor __author__ = "Stijn Peeters" @@ -51,11 +52,17 @@ def process(self): first_interval = "9999" last_interval = "0000" + annotations = [] + self.dataset.update_status("Processing items") with self.dataset.get_results_path().open("w") as results: counter = 0 for post in self.source_dataset.iterate_items(self): + + annotation = Annotation(value="test", label="count_posts_test", dataset=self.source_dataset) + annotations.append(annotation) + try: date = get_interval_descriptor(post, timeframe) except ValueError as e: diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 062782967..35104d28b 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -2,7 +2,7 @@ $(document).ready(function(){ $(init); -/** +/* * Page init */ function init() { @@ -15,7 +15,7 @@ function init() { } -/** +/* * Handle annotations */ const annotations = { @@ -334,7 +334,7 @@ const annotations = { let edited = false let timestamp = Date.now() / 100 - if (annotation_type == "text" || annotation_type == "textarea") { + if (annotation_type === "text" || annotation_type === "textarea") { val = $(this).find(".post-annotation-input").val(); // It can be the case that the input text is deleted // In this case we *do* want to push new data, so we check @@ -343,11 +343,11 @@ const annotations = { edited = true } } - else if (annotation_type == "dropdown") { + else if (annotation_type === "dropdown") { let selected = $(this).find(".post-annotation-options").val(); val = selected; } - else if (annotation_type == "checkbox") { + else if (annotation_type === "checkbox") { val = []; $(this).find(".post-annotation-options > input").each(function(){ if ($(this).is(":checked")) { @@ -361,7 +361,7 @@ const annotations = { val = undefined; } } - if ((val != undefined && val != "") || edited) { + if ((val !== undefined && val !== "") || edited) { vals_changed = true; val = ""; } @@ -375,7 +375,7 @@ const annotations = { "timestamp_created": "", "label": label, "type": annotation_type, - "options": , + "options": "", "value": "", "author": "", "by_processor": "", diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 466735a0f..a3acb82d3 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -4,7 +4,6 @@ """ import json 
-import re from pathlib import Path @@ -13,6 +12,7 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet +from common.lib.annotation import Annotation from common.lib.helpers import convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -132,9 +132,7 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations inserted already. # If so, also pass these to the template. - annotations = db.fetchall("SELECT * FROM annotations WHERE key = %s", (key,)) - if annotations: - annotations = json.loads(annotations) + annotations = dataset.get_annotations() # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) @@ -247,7 +245,7 @@ def explorer_save_annotation_fields(key): @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotations(key): +def explorer_save_annotations(key=None): """ Save the annotations of a dataset to the annotations table. @@ -257,19 +255,20 @@ def explorer_save_annotations(key): :return int: The number of posts with annotations saved. """ - # Get dataset. + # Save it! + annotations = request.get_json() + + # Annotations are always associated with a dataset. + if not key and annotations: + key = annotations[0].get("dataset", "") if not key: return error(404, error="No dataset key provided") try: dataset = DataSet(key=key, db=db) except DataSetException: return error(404, error="Dataset not found.") - - # Save it! 
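# A sketch of the JSON payload the rewritten view above now expects from
# explorer.js: a flat list of annotation objects (hypothetical, trimmed
# values) rather than the old per-post dict:
#
#   [{"post_id": "123", "dataset": "abcd1234", "label": "sentiment",
#     "type": "text", "value": "positive"}]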
- new_annotations = request.get_json() - dataset.save_annotations(new_annotations) - - return "success" + + return dataset.save_annotations(annotations) def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): """ From 6ddae4e47eeb5b2857dea74b10791661bedea22b Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 8 Aug 2024 17:21:08 +0200 Subject: [PATCH 119/204] Fix mistakes in database.sql --- backend/database.sql | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/backend/database.sql b/backend/database.sql index eafc9aaf8..c474eb773 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -71,6 +71,7 @@ CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, item_id TEXT, + field_id TEXT, dataset TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, @@ -84,21 +85,21 @@ CREATE TABLE IF NOT EXISTS annotations ( ); CREATE UNIQUE INDEX IF NOT EXISTS annotation_id - ON annotations_new ( + ON annotations ( id ); CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique - ON annotations_new ( + ON annotations ( label, dataset, - post_id + item_id ); CREATE INDEX IF NOT EXISTS annotation_value - ON annotations_new ( + ON annotations ( value ); CREATE INDEX IF NOT EXISTS annotation_timestamp - ON annotations_new ( + ON annotations ( timestamp ); From f679b7aad3ac8abcd0dcec2fb94d76bfa20821ad Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 8 Aug 2024 17:21:28 +0200 Subject: [PATCH 120/204] Make Annotation object usable --- common/lib/annotation.py | 239 ++++++++++++++++++++++----------------- common/lib/dataset.py | 36 ++++-- 2 files changed, 163 insertions(+), 112 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 771e28343..8cda9d0dd 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -2,13 +2,18 @@ Annotation class """ -from common.config_manager import config + +import time +import json + +from common.lib.exceptions import AnnotationException class Annotation: """ Annotation class - Annotations are always tied to a dataset and an item ID. + Annotations are always tied to a dataset, a dataset item (e.g. a csv row), + an annotation label, and a type ('text', 'multichoice', etc.). """ @@ -17,88 +22,166 @@ class Annotation: data = None db = None - id = "" # Unique ID for this annotation - parent_id = "" # ID of the data for this annotation, e.g. post ID - dataset = "" # Dataset key this annotation is generated from - timestamp = 0 # When this annotation was edited - timestamp_created = 0 # When this timestamp was created - label = "" # Label of annotation - options = [] # Possible options - value = "" # The actual annotation value - author = "" # Who made the annotation - by_processor = False # Whether the annotation was made by a processor - metadata = {} # Misc metadata - - def __init__(self, db, data, id=None, item_id=None, label=None, dataset_key=None): + id = None # Unique ID for this annotation + item_id = None # ID of the item for this annotation, e.g. 
post ID + field_id = None # If of this type of annotation field for this dataset + dataset = None # Dataset key this annotation is generated from + timestamp = None # When this annotation was edited + timestamp_created = None # When this timestamp was created + label = None # Label of annotation + options = None # Possible options + value = None # The actual annotation value + author = None # Who made the annotation + by_processor = None # Whether the annotation was made by a processor + metadata = None # Misc metadata + + def __init__(self, data=None, id=None, db=None): """ Instantiate annotation object. - :param db: Database connection object - :param dict data: Annotation data; should correspond to the annotations table records. - + :param data: Annotation data; should correspond to the annotations table record. + :param id: The ID of an annotation. If given, it retrieves the annotation + from the database. + :param db: Database connection object """ - self.db = db - self.data = data - self.item_id = item_id + required_fields = ["label", "item_id", "dataset"] - if id is not None: - self.id = id - current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.id,)) - if not current: - raise AnnotationException( - "Annotation() requires a valid ID for its 'id' argument, \"%s\" given" % id) + # Must have an ID or data + if id is None and (data is None or not isinstance(data, dict)): + raise AnnotationException("Annotation() requires either a `data` dictionary or ID.") - # Should be present for all annotation fields - mandatory_keys = ["post_id", "label", "value"] + if not db: + raise AnnotationException("Annotation() needs a `db` database object") + self.db = db - if dataset_key is not None and label is not None and dataset_key is not None: - current = self.db.fetchone("SELECT * FROM annotations WHERE key = %s", (self.key,)) + current = None + new_or_updated = False + + # Get the annotation data if the ID is given; if an annotation has + # an ID, it is guaranteed to be in the database. + # IDs can both be explicitly given or present in the data dict. + if id is not None or "id" in data: + if "id" in data: + id = data["id"] + self.id = id # IDs correspond to unique serial numbers in the database. + current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) if not current: - raise DataSetNotFoundException( - "DataSet() requires a valid dataset key for its 'key' argument, \"%s\" given" % key) - + raise AnnotationException( + "Annotation() requires a valid ID for its 'id' argument, %s given" % id) - def get_by_id(db, id): + # If an ID is not given, get or create an Annotation object from its data. + # First check if required fields are present in `data`. 
+ else: + for required_field in required_fields: + if required_field not in data or not data[required_field]: + raise AnnotationException("Annotation() requires a %s field" % required_field) + + # Check if this annotation already exists, based on the data + current = self.get_by_field(data["dataset"], data["item_id"], data["label"]) + + # If we were able to retrieve an annotation from the db, it already exists + if current: + # Check if we have to overwrite old data with new data + if data: + for key, value in data.items(): + # Save unknown fields in metadata + if key not in current: + current["metadata"][key] = value + new_or_updated = True + # Else update the value + elif current[key] != value: + current[key] = value + new_or_updated = True + + self.data = current + + # If this is a new annotation, set all the properties. + else: + # Keep track of when the annotation was made + created_timestamp = int(time.time()) + # Store unknown properties in `metadata` + metadata = {k: v for k, v in data.items() if k not in self.__dict__} + print(self.__dict__) + print(metadata) + new_data = { + "item_id": data["item_id"], + "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), + "dataset": data["dataset"], + "timestamp_created": timestamp, + "label": data["label"], + "type": data.get("type", "text"), + "options": data.get("options", ""), + "value": data.get("value", ""), + "author": data.get("author", ""), + "by_processor": data.get("by_processor", False), + "metadata": metadata + } + self.data = new_data + new_or_updated = True + + # Write to db if anything changed + if new_or_updated: + timestamp = int(time.time()) + self.timestamp = timestamp + self.write_to_db() + + def get_by_id(self, id): """ Get annotation by ID - :param db: Database connection object :param str name: ID of annotation :return: Annotation object, or `None` for invalid annotation ID """ - data = db.fetchone("SELECT * FROM annotations WHERE id = %s", (id,)) - if not annotation: - return None - else: - return Annotation.get_by_data(db, data) - def get_by_data(db, data): - """ - Instantiate annotation object with given data + try: + int(id) + except ValueError: + raise AnnotationException("Id '%s' is not valid" % id) - :param db: Database handler - :param dict data: Annotation data, should correspond to a database row - :return Annotation: Annotation object - """ - return Annotation(db, data) + return Annotation(id=id) - def set_id_by_data(self, item): + def get_by_field(self, dataset_key, item_id, label): """ - Creates an ID based on the data of the item it has annotated. + Get the annotation information via its dataset key, item ID, and label. + This is always a unique comibination. + :param dataset_key: The key of the dataset this annotation was made for. + :param item_id: The ID of the item this annotation was made for. + :param label: The label of the annotation. + :return data: A dict with data of the retrieved annotation, or None if it doesn't exist. """ + data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", + (dataset_key, item_id, label)) + if not data: + return None - return True + data["metadata"] = json.loads(data["metadata"]) + return data - def save(self): + def get_field_id(self, dataset_key, label): """ - Save an annotation to the database. + Sets a `field_id` based on the dataset key and label. + This combination should be unique. 
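The (dataset, item_id, label) triple that `get_by_field()` queries on is the combination enforced by the `annotation_unique` index from the database.sql patch above, so at most one row can match. As a usage sketch, assuming the parameterized `db.fetchone()` used throughout 4CAT and placeholder values:

    # Sketch: fetch one annotation by its unique (dataset, item_id, label) triple.
    # The dataset key, item ID and label below are placeholders.
    row = db.fetchone(
        "SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s",
        ("abcd1234efgh5678", "12345", "sentiment")
    )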
+ + :param dataset_key: The dataset key + :param label: The label of the dataset. """ - return True + field_id_base = "-".join([dataset_key, label]) + field_id = int.from_bytes(field_id_base.encode(), "little") + self.field_id = field_id + return field_id + + def write_to_db(self): + """ + Write an annotation to the database. + """ + data = self.data + data["metadata"] = json.dumps(data["metadata"]) + return self.db.upsert("annotations", data=data, constraints=["dataset", "label", "item_id"]) @staticmethod def save_many(self, annotations, overwrite=True): @@ -112,53 +195,8 @@ def save_many(self, annotations, overwrite=True): :returns int: How many annotations were saved. """ - - field_keys = {} - annotations_to_delete = set() - - # We're going to add the annotation metadata to the datasets table - # based on the annotations themselves. - annotation_fields = self.get_annotation_fields() - existing_annotations = self.get_annotations() - existing_labels = set(a["label"] for a in existing_annotations) if existing_annotations else [] - - timestamp = time.time() - new_annotations = [] for annotation in annotations: - - # Do some validation; dataset key, post_id, label, and value need to be present. - missing_keys = [] - for mandatory_key in mandatory_keys: - if mandatory_key not in annotation: - missing_keys.append(mandatory_key) - if missing_keys: - raise AnnotationException("Couldn't add annotations; missing field(s) %s" % ",".join(missing_keys)) - - # Add dataset key - annotation["dataset"] = self.key - - # Raise exception if this label is already present for this dataset - # and we're not overwriting - if not overwrite and annotation["label"] in existing_labels: - raise AnnotationException("Couldn't save annotations; label %s already present") - - # If there's no type given, use 'text' - if not annotation.get("type"): - annotation["type"] = "text" - - # If there's no timestamp given, set it to the current time. - if not "timestamp" in annotation: - annotation["timestamp"] = timestamp - annotation["timestamp_created"] = timestamp - - # If not already given, create an ID for this annotation - # based on the label, type, and dataset key. - if "field_id" not in annotation: - field_id_base = "-".join(annotation["dataset"], annotation["label"], annotation.get("type", "")) - field_id = int.from_bytes(field_id_base.encode(), "little") - annotation["field_id"] = field_id - # Add annotation metadata if it is not saved to the datasets table yet. # This is just a simple dict with a field ID, type, label, and possible options. if annotation["field_id"] not in annotation_fields: @@ -214,7 +252,6 @@ def delete_many(self, dataset_key=None, id=None, field_id=None): return self.db.delete("annotations", where) - def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 92dcfc625..223c4cd77 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1605,6 +1605,20 @@ def has_annotation_fields(self): return True if annotation_fields else False + def make_annotations(self, annotations): + """ + Generates a list of annotation objects from annotation JSON. 
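For a sense of the field IDs minted by `get_field_id()` above: joining the dataset key and label and reading the bytes as a little-endian integer gives a deterministic but very long number, which patch 121 below swaps for a fixed-length MD5 hex digest. A sketch with placeholder inputs:

    # Sketch: the integer field ID produced by the "-".join + int.from_bytes scheme.
    field_id_base = "-".join(["abcd1234efgh5678", "sentiment"])  # placeholder key and label
    field_id = int.from_bytes(field_id_base.encode(), "little")  # deterministic ~200-bit integer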
+ :param annotations: A list of dicts or JSON string with annotations + """ + if not annotations: + return None + if isinstance(annotations, str): + annotaitons = json.loads(annotations) + + annotations = [Annotation(annotation, self) for annotation in annotations] + + return annotations + def get_annotation_fields(self): """ Retrieves the saved annotation fields for this dataset. @@ -1822,17 +1836,17 @@ def save_annotations(self, annotations, overwrite=True): if not annotations: return 0 - # Add dataset info to annotations - key = self.key - owner = self.get_owners()[0] - if "dataset" - for i in range(len(annotations)): - if not annotations[i].get("dataset"): - annotations[i]["dataset"] = key - if not annotations[i].get("author"): - annotations[i]["author"] = owner - - return Annotation.save_many(annotations, overwrite=overwrite) + # Add some dataset data to annotations, if not present + for annotation in annotations: + # Set dataset key + if not annotation.get("dataset"): + annotation["dataset"] = self.key + # Set default author to this dataset owner + if not annotation.get("author"): + annotation["author"] = self.get_owners()[0] + + # Create Annotation object, which saves it to the database + Annotation(data=annotation, db=self.db) def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ From 24425e157aebbc1561bd1e091d01c7167e293e15 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 9 Aug 2024 17:55:01 +0200 Subject: [PATCH 121/204] General annotations improvements and make processors save annotations --- backend/database.sql | 4 +- backend/lib/processor.py | 55 +++ common/lib/annotation.py | 227 ++++++++--- common/lib/database.py | 2 +- common/lib/dataset.py | 412 +++++++++----------- helper-scripts/migrate/migrate-1.45-1.46.py | 10 +- processors/metrics/count_posts.py | 4 +- 7 files changed, 402 insertions(+), 312 deletions(-) diff --git a/backend/database.sql b/backend/database.sql index c474eb773..01e124eaa 100644 --- a/backend/database.sql +++ b/backend/database.sql @@ -70,9 +70,9 @@ CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ -- annotations CREATE TABLE IF NOT EXISTS annotations ( id SERIAL PRIMARY KEY, - item_id TEXT, - field_id TEXT, dataset TEXT, + field_id TEXT, + item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, label TEXT, diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 24b7b4a11..5a0aaff52 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -712,6 +712,59 @@ def create_standalone(self): return standalone + def write_annotations(self, annotations: list, source_dataset=None, overwrite=False) -> int: + """ + Saves annotations made by this processor on the basis of another dataset. + Also adds some data regarding this processor: set `author` and `label` to processor name, + and add parameters to `metadata` (unless explicitly indicated). + + :param annotations: List of dictionaries with annotation items. Must have `item_id` and `value`. + E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] + :param source_dataset: The dataset that these annotations were based on. + Defaults to the parent dataset. + :param bool overwrite: Whether to overwrite annotations if the label is already present + for the dataset. If this is False and the label is already present, + we'll add a number to the label to differentiate it (e.g. `count-posts1`). + Else we'll just replace the old data. + + :returns int: How many annotations were saved. 
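The count_posts.py hunk at the end of this patch shows the intended calling pattern for this hook: a processor collects one dict per item and hands the list over. Roughly, inside a processor's process() method:

    # Sketch: annotate every item of the source dataset; "example" is a placeholder value.
    # Label and author fall back to the processor's name, per the code below.
    annotations = []
    for item in self.source_dataset.iterate_items(self):
        annotations.append({"item_id": item["id"], "value": "example"})
    self.write_annotations(annotations)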
+ + """ + + if not annotations: + return 0 + + # Default to parent dataset + if not source_dataset: + source_dataset = self.source_dataset + + # Check if this dataset already has annotation fields + existing_labels = source_dataset.get_annotation_field_labels() + + # Set some values + for annotation in annotations: + + # Set the default author and label to this processor's name + if not annotation.get("label"): + # If the processor has already generated annotation fields, + # add a number to differentiate the label + label = self.name + if not overwrite and label in existing_labels: + label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) + annotation["label"] = label + if not annotation.get("author"): + annotation["author"] = self.name + + annotation["by_processor"] = True + + # Add processor parameters to annotation metadata + if not annotation.get("metadata"): + annotation["metadata"] = {} + annotation["metadata"]["processor-parameters"] = self.parameters + + annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) + return annotations_saved + @classmethod def map_item_method_available(cls, dataset): """ @@ -847,6 +900,8 @@ def get_extension(self, parent_dataset=None): # A non filter processor updated the base Processor extension to None/False? return None + + @classmethod def is_rankable(cls, multiple_items=True): """ diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 8cda9d0dd..b871006a9 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -5,6 +5,7 @@ import time import json +import hashlib from common.lib.exceptions import AnnotationException @@ -49,7 +50,7 @@ def __init__(self, data=None, id=None, db=None): # Must have an ID or data if id is None and (data is None or not isinstance(data, dict)): - raise AnnotationException("Annotation() requires either a `data` dictionary or ID.") + raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.") if not db: raise AnnotationException("Annotation() needs a `db` database object") @@ -69,7 +70,7 @@ def __init__(self, data=None, id=None, db=None): current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) if not current: raise AnnotationException( - "Annotation() requires a valid ID for its 'id' argument, %s given" % id) + "Annotation() requires a valid ID for an existing annotation, %s given" % id) # If an ID is not given, get or create an Annotation object from its data. # First check if required fields are present in `data`. @@ -78,7 +79,7 @@ def __init__(self, data=None, id=None, db=None): if required_field not in data or not data[required_field]: raise AnnotationException("Annotation() requires a %s field" % required_field) - # Check if this annotation already exists, based on the data + # Check if this annotation already exists, based on dataset key, item id, and label. current = self.get_by_field(data["dataset"], data["item_id"], data["label"]) # If we were able to retrieve an annotation from the db, it already exists @@ -90,7 +91,7 @@ def __init__(self, data=None, id=None, db=None): if key not in current: current["metadata"][key] = value new_or_updated = True - # Else update the value + # If values differ, update the value elif current[key] != value: current[key] = value new_or_updated = True @@ -99,39 +100,39 @@ def __init__(self, data=None, id=None, db=None): # If this is a new annotation, set all the properties. 
else: + # Keep track of when the annotation was made created_timestamp = int(time.time()) - # Store unknown properties in `metadata` - metadata = {k: v for k, v in data.items() if k not in self.__dict__} - print(self.__dict__) - print(metadata) + new_data = { "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), "dataset": data["dataset"], - "timestamp_created": timestamp, + "timestamp": 0, + "timestamp_created": created_timestamp, "label": data["label"], "type": data.get("type", "text"), "options": data.get("options", ""), "value": data.get("value", ""), "author": data.get("author", ""), "by_processor": data.get("by_processor", False), - "metadata": metadata + "metadata": data.get("metadata", {}), } + self.data = new_data new_or_updated = True # Write to db if anything changed if new_or_updated: - timestamp = int(time.time()) - self.timestamp = timestamp + self.data["timestamp"] = int(time.time()) + print(self.data) self.write_to_db() - def get_by_id(self, id): + def get_by_id(self, id: int): """ Get annotation by ID - :param str name: ID of annotation + :param str id: ID of annotation :return: Annotation object, or `None` for invalid annotation ID """ @@ -142,10 +143,10 @@ def get_by_id(self, id): return Annotation(id=id) - def get_by_field(self, dataset_key, item_id, label): + def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ Get the annotation information via its dataset key, item ID, and label. - This is always a unique comibination. + This is always a unique combination. :param dataset_key: The key of the dataset this annotation was made for. :param item_id: The ID of the item this annotation was made for. @@ -157,12 +158,12 @@ def get_by_field(self, dataset_key, item_id, label): data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", (dataset_key, item_id, label)) if not data: - return None + return {} data["metadata"] = json.loads(data["metadata"]) return data - def get_field_id(self, dataset_key, label): + def get_field_id(self, dataset_key: str, label: str) -> str: """ Sets a `field_id` based on the dataset key and label. This combination should be unique. @@ -170,8 +171,7 @@ def get_field_id(self, dataset_key, label): :param dataset_key: The dataset key :param label: The label of the dataset. """ - field_id_base = "-".join([dataset_key, label]) - field_id = int.from_bytes(field_id_base.encode(), "little") + field_id = hashlib.md5(dataset_key + label.encode("utf-8")).hexdigest() self.field_id = field_id return field_id @@ -179,48 +179,10 @@ def write_to_db(self): """ Write an annotation to the database. """ - data = self.data - data["metadata"] = json.dumps(data["metadata"]) - return self.db.upsert("annotations", data=data, constraints=["dataset", "label", "item_id"]) - - @staticmethod - def save_many(self, annotations, overwrite=True): - """ - Takes a list of annotations and saves them to the annotations table. - If a field is not yet present in the datasets table, it also adds it there. - - :param bool overwrite: Whether to overwrite annotation if the label is already present - for the dataset. - - :returns int: How many annotations were saved. - - """ - new_annotations = [] - for annotation in annotations: - # Add annotation metadata if it is not saved to the datasets table yet. - # This is just a simple dict with a field ID, type, label, and possible options. 
- if annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation["type"] - } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] - - new_annotations.append(annotation) - - # Save annotation fields if they're not present yet. - if annotation_fields != self.get_annotation_fields(): - self.save_annotation_fields(annotation_fields) - - # If there's nothing to save or delete, do nothing - if not new_annotations: - return 0 - - # Overwrite old annotations with upsert. Else add. - self.db.upsert("annotations", new_annotations, constraints=["dataset", "post_id", "label"]) - - return len(new_annotations) + db_data = self.data + m = db_data["metadata"] # To avoid circular reference error + db_data["metadata"] = json.dumps(m) + return self.db.upsert("annotations", data=db_data, constraints=["label", "dataset", "item_id"]) def delete(self): """ @@ -229,10 +191,11 @@ def delete(self): return self.db.delete("annotations", {"id": self.id}) @staticmethod - def delete_many(self, dataset_key=None, id=None, field_id=None): + def delete_many(db, dataset_key=None, id=None, field_id=None): """ Deletes annotations for an entire dataset or by a list of (field) IDs. + :param db: Database object. :param str dataset_key: A dataset key. :param li id: A list or string of unique annotation IDs. :param li field_id: A list or string of IDs for annotation fields. @@ -250,7 +213,143 @@ def delete_many(self, dataset_key=None, id=None, field_id=None): if field_id: where["field_id"] = field_id - return self.db.delete("annotations", where) + return db.delete("annotations", where) + + @staticmethod + def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dict, db) -> int: + """ + Updates annotations in the annotations table if the input fields + themselves have been changed, for instance if a dropdown label is renamed + or a field is deleted. + + :param str dataset_key: The dataset key for which fields changed. + :param dict old_fields: Old annotation fields. + :param dict new_fields: New annotation fields; this should contain not just + the additions, but all fields, changed or otherwise. + :param db: Database object so we can write. + + :returns int: How many records were affected. + """ + + text_fields = ["textarea", "text"] + + # If old and new fields are identical, do nothing. + if old_fields == new_fields: + return 0 + + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID + + # Loop through the old annotation fields + for old_field_id, old_field in old_fields.items(): + + # Delete all annotations of this type if the field is deleted. + if old_field_id not in new_fields: + fields_to_delete.add(old_field_id) + continue + + field_id = old_field_id + new_field = new_fields[field_id] + + # If the annotation type has changed, also delete existing annotations, + # except between text and textarea, where we can just change the type and keep the text. + if old_field["type"] != new_field["type"]: + if not old_field["type"] in text_fields and not new_field["type"] in text_fields: + fields_to_delete.add(field_id) + continue + + # Loop through all the key/values in the new field settings + # and update in case it's different from the old values. 
+ update_data = {} + for field_key, field_value in new_field.items(): + + # Update if values don't match + if field_value != old_field.get(field_key): + + # Special case: option values that are removed/renamed. + # Here we may have to change/delete values within the + # values column. + if field_key == "options": + + new_options = field_value + + # Edge case: delete annotations of this type if all option fields are deleted + if not new_options: + fields_to_delete.add(field_id) + continue + + old_options = old_field["options"] + options_to_update = {} + + # Options are saved in a dict with IDs as keys and labels as values. + for old_option_id, old_option in old_options.items(): + # Renamed option label + if old_option_id in new_options and old_option != new_options[old_option_id]: + options_to_update[old_option] = new_options[old_option_id] # Old label -> new label + # Deleted option + elif old_option_id not in new_options: + options_to_update[old_option] = None # Remove None labels later + + if options_to_update: + update_data[field_key] = {"options": options_to_update} + + # For all other changes, just overwrite with new data. + else: + update_data[field_key] = field_value + + if update_data: + fields_to_update[field_id] = update_data + + # Delete annotations + if fields_to_delete: + Annotation.delete_many(db, field_id=list(fields_to_delete)) + + # Write changes to fields to database + count = 0 + if fields_to_update: + for field_id, updates in fields_to_update.items(): + + # Write to db + for column, update_value in updates.items(): + + # Change values of columns + updates = db.update("annotations", {column: update_value}, + where={"dataset": dataset_key, "field_id": field_id}) + count += updates + + # Special case: Changed options. + # Here we have to also rename/remove inserted options from the values column. + if column == "options": + + inserted_options = db.fetchall("SELECT id, value FROM annotations " + "WHERE dataset = %s and field_id = %s" % (dataset_key, field_id)) + new_inserts = [] + for inserted_option in inserted_options: + + annotation_id = inserted_option["id"] + inserted_option = inserted_option["value"] + + if not inserted_option: + continue + + # Remove or rename options + new_values = [] + for inserted_option in inserted_options: + if inserted_option in update_value: + if update_value[inserted_option] == None: + # Don't add + continue + elif inserted_option in update_value: + # Replace with new value + new_values.append(update_value[inserted_option]) + else: + # Keep old value + new_values.append(inserted_option) + + new_values = ",".join(new_values) + db.update("annotations", {"value": new_values}, where={"id": annotation_id}) + + return count def __getattr__(self, attr): """ diff --git a/common/lib/database.py b/common/lib/database.py index 9166dab4f..eb69a0d2f 100644 --- a/common/lib/database.py +++ b/common/lib/database.py @@ -105,8 +105,8 @@ def update(self, table, data, where=None, commit=True): Update a database record :param string table: Table to update - :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param dict data: Data to set, Column => Value + :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param bool commit: Whether to commit after executing the query :return int: Number of affected rows. 
Note that this may be unreliable if `commit` is `False` diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 223c4cd77..c377f37b5 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -334,14 +334,10 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau if own_processor and own_processor.map_item_method_available(dataset=self): item_mapper = True - # Annotations and annotation fields are dynamically added to top-level dataset - # and we're handling as 'extra' map_item fields. - annotation_fields = None - annotations = None - if self.is_top_dataset(): - annotation_fields = self.get_annotation_fields() - if annotation_fields: - annotations = self.get_annotations() + # Annotations are dynamically added + # and we're handling them as 'extra' map_item fields. + has_annotations = self.has_annotations() + annotation_labels = self.get_annotation_field_labels() # missing field strategy can be for all fields at once, or per field # if it is per field, it is a dictionary with field names and their strategy @@ -387,27 +383,30 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau mapped_item = original_item # Add possible annotations - if annotation_fields: - for annotation_field in annotation_fields.values(): + if has_annotations: + + # Get the annotation, if available. + post_annotations = self.get_annotations(item_ids=[mapped_item.data["id"]]) + + # We're always handling annotated data as a MappedItem object, + # even if no map_item() function is available for the data source. + if not isinstance(mapped_item, MappedItem): + mapped_item = MappedItem(mapped_item) + + for annotation_label in annotation_labels: annotation = "" - annotation_label = annotation_field["label"] - - # Get the annotation, if available. - # We're always handling annotated data as a MappedItem object, - # even if no map_item() function is available for the data source. - if not isinstance(mapped_item, MappedItem): - mapped_item = MappedItem(mapped_item) - - if annotations and mapped_item.data.get("id", "") in annotations: - annotation = annotations[mapped_item.data["id"]].get(annotation_label, "") - if isinstance(annotation, list): - annotation = ",".join(annotation) + + for post_annotation in post_annotations: + if post_annotation.label == annotation_label: + annotation = post_annotation.value + if isinstance(annotation, list): + annotation = ",".join(annotation) # We're always adding an annotation value, # as an empty string if it's absent. 
mapped_item.data[annotation_label] = annotation - + # yield a DatasetItem, which is a dict with some special properties yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item)) @@ -658,7 +657,7 @@ def get_owners_users(self, role="owner"): # owners that are owner by being part of a tag owners.extend(itertools.chain(*[tagged_owners for tag, tagged_owners in self.tagged_owners.items() if - role is None or self.owners[f"tag:{tag}"]["role"] == role])) + role is None or self.owners[f"tag:{tag}"]["role"] == role])) # de-duplicate before returning return set(owners) @@ -1519,7 +1518,7 @@ def file_exists(self): if self.get_results_path().exists(): return True - + return False def get_extension(self): @@ -1545,7 +1544,7 @@ def get_result_url(self): """ filename = self.get_results_path().name url_to_file = ('https://' if config.get("flask.https") else 'http://') + \ - config.get("flask.server_name") + '/result/' + filename + config.get("flask.server_name") + '/result/' + filename return url_to_file def warn_unmappable_item(self, item_count, processor=None, error_message=None, warn_admins=True): @@ -1572,31 +1571,40 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w # No other log available raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn") - # Annotation features - def get_annotations(self): + # Annotation functions (most of it is handled in Annotations) + def has_annotations(self) -> bool: + """ + Whether this dataset has annotations + """ + + annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + return True if annotation else False + + def get_annotations(self, item_ids=[]) -> list: """ Retrieves the annotations for this dataset. + + :param item_ids: A list of item IDs to get the annotations from. + If empty, get all the annotations for this dataset. + return list: All annotations, each in their own dictionary. """ - annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + + if item_ids: + annotations = self.db.fetchall("SELECT * FROM annotations " + "WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_ids))) + else: + annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) if not annotations: - annotations = None + annotations = [] else: - annotations = [Annotation(data=annotation, dataset=self) for annotation in annotations] + annotations = [Annotation(data=annotation, db=self.db) for annotation in annotations] return annotations - def has_annotations(self): - """ - Returns True if there's one or more annotations found - """ - - annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) - - return True if annotation else False - - def has_annotation_fields(self): + def has_annotation_fields(self) -> bool: """ Returns True if there's annotation fields saved tot the dataset table """ @@ -1605,21 +1613,7 @@ def has_annotation_fields(self): return True if annotation_fields else False - def make_annotations(self, annotations): - """ - Generates a list of annotation objects from annotation JSON. 
- :param annotations: A list of dicts or JSON string with annotations - """ - if not annotations: - return None - if isinstance(annotations, str): - annotaitons = json.loads(annotations) - - annotations = [Annotation(annotation, self) for annotation in annotations] - - return annotations - - def get_annotation_fields(self): + def get_annotation_fields(self) -> dict: """ Retrieves the saved annotation fields for this dataset. These are stored in the annotations table. @@ -1632,200 +1626,35 @@ def get_annotation_fields(self): if annotation_fields and annotation_fields.get("annotation_fields"): annotation_fields = json.loads(annotation_fields["annotation_fields"]) else: - annotation_fields = None + annotation_fields = {} return annotation_fields - def save_annotation_fields(self, new_fields, add=False): + def get_annotation_field_labels(self) -> list: """ - Save annotation field data to the datasets table (in the `annotation_fields` column). - If changes to the annotation fields affect existing annotations, - this function will also call `update_annotations_via_fields()` to change them. - - :param dict new_fields: New annotation fields, with a field ID as key. - - :param bool add: Whether we're merely adding new fields - or replacing the whole batch. If add is false, - `new_fields` should contain all fields. - - :return int: The number of annotation fields saved. - - """ - - # Get existing annotation fields to see if stuff changed. - old_fields = self.get_annotation_fields() - changes = False - - # Do some validation - # Annotation field must be valid JSON. - try: - json.dumps(new_fields) - except ValueError: - raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) - - # Annotation fields must at minimum have `type` and `label` keys. - for field_id, annotation_field in new_fields.items(): - if not isinstance(field_id, str): - raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) - if not "label" in annotation_field: - raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) - if not "type" in annotation_field: - raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) - - # Keep track of whether existing fields have changed; if so, we're going to - # update the annotations table. - if field_id in old_fields: - if old_fields[field_id] != annotation_field: - changes = True - - # If we're just adding fields, add them to the old fields - # If the field already exists, overwrite the old field. - if add and old_fields: - all_fields = old_fields - for field_id, annotation_field in new_fields.items(): - all_fields[field_id] = annotation_field - new_fields = all_fields - - # We're saving the new annotation fields as-is. - # Ordering of fields is preserved this way. - self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) - - # If we're adding but the field already exists, update/delete annotations with that ID. - add_and_overlap = add and any([True for k in list(new_fields.keys()) if k in old_fields]) - - if changes or add_and_overlap: - self.update_annotations_via_fields(old_fields, new_fields) - - return len(new_fields) + Retrieves the saved annotation field labels for this dataset. + These are stored in the annotations table. - def update_annotations_via_fields(self, old_fields, new_fields): + :return list: List of annotation field labels. 
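Since fields are stored as a dict of field ID to settings, the label extraction below is a single comprehension. For example, with hypothetical stored fields shaped the way `save_annotations()` builds them later in this patch:

    # Hypothetical contents of the datasets table's annotation_fields column.
    annotation_fields = {
        "5f2b6e...": {"label": "sentiment", "type": "dropdown", "options": {"o1": "positive", "o2": "negative"}},
        "9c81aa...": {"label": "notes", "type": "textarea"}
    }
    labels = [v["label"] for v in annotation_fields.values()]  # ["sentiment", "notes"]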
""" - Updates annotations in the annotations table if the input field - itself has been changed, for instance if a dropdown label is renamed. - - :param di old_fields: Old annotation fields - :param di new_fields: New annotation fields; this should contain not just - additions, but all fields, changed or otherwise. - - """ - - text_fields = ["textarea", "text"] - - # If old and new fields are identical, do nothing. - if old_fields == new_fields: - return - - # Only update annotations if they, in fact, exist. - annotations = self.get_annotations() - if not annotations: - return - - fields_to_delete = set() # Delete all annotations with this field ID - fields_to_update = {} # Update values of annotations with this field ID - # Loop through the old annotation fields - for old_field_id, old_field in old_fields.items(): + annotation_fields = self.get_annotation_fields() - # Delete all annotations of this type if the field is deleted. - if old_field_id not in new_fields: - fields_to_delete.add(old_field_id) - continue + if not annotation_fields: + return [] - field_id = old_field_id - new_field = new_fields[field_id] + labels = [v["label"] for v in annotation_fields.values()] - # If the annotation type has changed, also delete existing annotations, - # except between text and textarea, where we can just change the type and keep the text. - if old_field["type"] != new_field["type"]: - if not old_field["type"] in text_fields and not new_field["type"] in text_fields: - fields_to_delete.add(field_id) - continue + return labels - # Loop through all the key/values in the new field data - # and update in case it's different from the old values. - update_data = {} - for field_key, field_value in new_field.items(): - - # Update if values don't match - if field_value != old_field.get(field_key): - - # Special case: option values that are removed/renamed. - # Here we only have to change specific values within the - # values column. - if field_key == "options": - - new_options = field_value - # Delete annotations of this type if all option fields are deleted - # (even though this should not be possible in the Explorer front-end) - if not new_options: - fields_to_delete.add(field_id) - continue - - old_options = old_field["options"] - - options_to_update = {} - - # Options are saved in a dict with IDs and labels as keys/values. - for old_option_id, old_option in old_options.items(): - # Renamed option label - if old_option_id in new_options and old_option != new_options[old_option_id]: - options_to_update[old_option] = new_options[old_option_id] # Old label -> new label - # Deleted option - elif old_option_id not in new_options: - options_to_update[old_option] = None # Remove None labels - - if options_to_update: - update_data[field_key] = {} - update_data[field_key]["options"] = options_to_update - - # For all other changes, just overwrite with new data. 
- else: - update_data[field_key] = field_value - - if update_data: - fields_to_update[field_id] = update_data - - # Delete annotations - if fields_to_delete: - self.delete_annotations(field_id=list(fields_to_delete)) - - # Change annotations based on changes in update fields - if fields_to_update: - new_annotations = [] - for annotation in annotations: - if annotation.field_id in fields_to_update: - for k, update_field in fields_to_update[annotation["field_id"]]: - - # Special case: Changed options - if k == "options": - new_values = [] - for inserted_option in annotations["value"].split(","): - if inserted_option in update_field: - if update_field[inserted_option] == None: - # Don't add - continue - elif inserted_option in update_field: - # Replace with new value - new_values.append(update_field[inserted_option]) - else: - # Keep old value - new_values.append(inserted_option) - - update_field = new_values - - annotation[k] = update_field - - new_annotations.append(annotation) - - # Save updated annotations - self.save_annotations(new_annotations) - - def save_annotations(self, annotations, overwrite=True): + def save_annotations(self, annotations: list, overwrite=True) -> int: """ Takes a list of annotations and saves them to the annotations table. - If a field is not yet present in the datasets table, it also adds it there. + If a field is not yet present in the `annotation_fields` column in + the datasets table, it also adds it there. - :param list annotations: List of dictionaries with annotation items. + :param list annotations: List of dictionaries with annotation items. Must have `item_id` and `label`. + E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] :param bool overwrite: Whether to overwrite annotation if the label is already present for the dataset. 
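In other words, the minimal input is the docstring example above; a sketch of saving ad-hoc annotations on a dataset object:

    # Sketch: two annotations for the same "Valid" field on one dataset.
    dataset.save_annotations([
        {"item_id": "12345", "label": "Valid", "value": "Yes"},
        {"item_id": "12346", "label": "Valid", "value": "No"}
    ])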
@@ -1836,17 +1665,63 @@ def save_annotations(self, annotations, overwrite=True): if not annotations: return 0 + count = 0 + annotation_fields = self.get_annotation_fields() + annotation_labels = self.get_annotation_field_labels() + known_field_ids = {} # Just so we don't have to hash every annotation without a field ID + # Add some dataset data to annotations, if not present for annotation in annotations: + + # Check if the required fields are present + if "item_id" not in annotation: + raise AnnotationException("Can't save annotations; annotation must have an `item_id` referencing " + "the item they annotated, got %s" % annotation) + if "label" not in annotation or not isinstance(annotation["label"], str): + raise AnnotationException("Can't save annotations; annotation must have a `label` field, " + "got %s" % annotation) + if not overwrite and annotation["label"] in annotation_labels: + raise AnnotationException("Can't save annotations; annotation field with label %s " + "already exists" % annotation["label"]) + # Set dataset key if not annotation.get("dataset"): annotation["dataset"] = self.key + + # If not present, add an ID for this annotation field, based on the dataset key and label + if "field_id" not in annotation: + field_id_str = annotation["label"] + annotation["dataset"] + # Check if we hashed this before + if field_id_str in known_field_ids: + field_id = known_field_ids[field_id_str] + else: + field_id = hashlib.md5(field_id_str.encode("utf-8")).hexdigest() + annotation["field_id"] = field_id + # Set default author to this dataset owner + # If this annotation is made by a processor, it will have the processor name if not annotation.get("author"): annotation["author"] = self.get_owners()[0] - # Create Annotation object, which saves it to the database + # Add data on the type of annotation field, if it is not saved to the datasets table yet. + # For now this is just a simple dict with a field ID, type, label, and possible options. + if not annotation_fields or annotation["field_id"] not in annotation_fields: + annotation_fields[annotation["field_id"]] = { + "label": annotation["label"], + "type": annotation.get("type", "text") # Default to text + } + if "options" in annotation: + annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + + # Create Annotation object, which also saves it to the database Annotation(data=annotation, db=self.db) + count += 1 + + # Save annotation fields if things changed + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + return count def delete_annotations(self, dataset_key=None, id=None, field_id=None): """ @@ -1857,7 +1732,7 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): :param li field_id: A list or string of IDs for annotation fields. :return int: The number of removed records. - """ + """ if not dataset_key and not id and not field_id: return 0 @@ -1872,6 +1747,67 @@ def delete_annotations(self, dataset_key=None, id=None, field_id=None): return self.db.delete("annotations", where) + def save_annotation_fields(self, new_fields: dict, add=False) -> int: + """ + Save annotation field data to the datasets table (in the `annotation_fields` column). + If changes to the annotation fields affect existing annotations, + this function will also call `update_annotations_via_fields()` to change them. + + :param dict new_fields: New annotation fields, with a field ID as key. 
+ + :param bool add: Whether we're merely adding new fields + or replacing the whole batch. If add is False, + `new_fields` should contain all fields. + + :return int: The number of annotation fields saved. + + """ + + # Get existing annotation fields to see if stuff changed. + old_fields = self.get_annotation_fields() + changes = False + + # Do some validation + # Annotation field must be valid JSON. + try: + json.dumps(new_fields) + except ValueError: + raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields) + + # Annotation fields must at minimum have `type` and `label` keys. + for field_id, annotation_field in new_fields.items(): + if not isinstance(field_id, str): + raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id) + if "label" not in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a label" % field_id) + if "type" not in annotation_field: + raise AnnotationException("Can't save annotation fields: all fields must have a type" % field_id) + + # Keep track of whether existing fields have changed; if so, we're going to + # update the annotations table. + if field_id in old_fields: + if old_fields[field_id] != annotation_field: + changes = True + + # If we're just adding fields, add them to the old fields. + # If the field already exists, overwrite the old field. + if add and old_fields: + all_fields = old_fields + for field_id, annotation_field in new_fields.items(): + all_fields[field_id] = annotation_field + new_fields = all_fields + + # We're saving the new annotation fields as-is. + # Ordering of fields is preserved this way. + self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key)) + + # If anything changed with the annotation fields, possibly update + # existing annotations (e.g. to delete them or change their labels). 
+ if changes: + Annotation.update_annotations_via_fields(self.key, old_fields, new_fields, self.db) + + return len(new_fields) + def __getattr__(self, attr): """ Getter so we don't have to use .data all the time diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index e0248cfcf..c127de266 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -37,8 +37,8 @@ for field_id, annotation_field in annotation_fields.items(): if "options" in annotation_field: - flattened_options = {} + if isinstance(annotation_field["options"], list): for op in annotation_field["options"]: flattened_options.update(op) @@ -55,9 +55,9 @@ db.execute(""" CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, - field_id SERIAL, - post_id TEXT, dataset TEXT, + field_id SERIAL, + item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, label TEXT, @@ -80,7 +80,7 @@ ON annotations_new ( label, dataset, - post_id + item_id ); CREATE INDEX IF NOT EXISTS annotation_value ON annotations_new ( @@ -104,7 +104,7 @@ count = 0 skipped_count = 0 - columns = "post_id,field_id,dataset,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index ea1ef48f5..af32ed565 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -3,7 +3,6 @@ """ from common.lib.helpers import UserInput, pad_interval, get_interval_descriptor -from common.lib.annotation import Annotation from backend.lib.processor import BasicProcessor __author__ = "Stijn Peeters" @@ -60,7 +59,7 @@ def process(self): for post in self.source_dataset.iterate_items(self): - annotation = Annotation(value="test", label="count_posts_test", dataset=self.source_dataset) + annotation = {"value": "test", "item_id": post["id"]} annotations.append(annotation) try: @@ -153,6 +152,7 @@ def process(self): row["value_relative"] = intervals[interval]["relative"] rows.append(row) + self.write_annotations(annotations) self.write_csv_items_and_finish(rows) @classmethod From 77213410f2a3ce130ceca3d0923bc7137159c5e2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 13:08:57 +0200 Subject: [PATCH 122/204] fix: Bug in migrate --- helper-scripts/migrate/migrate-1.45-1.46.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index c127de266..5e8f0d979 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -110,7 +110,6 @@ for row in annotations: dataset = db.fetchone("SELECT * FROM datasets WHERE key = '" + row["dataset"] + "';") - # If the dataset is not present anymore, # we're going to skip these annotations; # likely the dataset is expired. 
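As an aside on the dataset lookup a few lines up: queries elsewhere in this series pass values as parameters instead of concatenating them into the SQL string. The equivalent parameterized form would be:

    # Parameterized variant of the lookup above.
    dataset = db.fetchone("SELECT * FROM datasets WHERE key = %s;", (row["dataset"],))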
@@ -119,7 +118,11 @@ skipped_count += 1 continue - annotation_fields = json.loads(dataset["annotation_fields"]) + annotation_fields = dataset["annotation_fields"] + if annotation_fields: + annotation_fields = json.loads(dataset.get("annotation_fields")) + else: annotation_fields = {} + author = dataset.get("creator", "") if not row.get("annotations"): From 22f6ea2fbea33aa1eeb7c76564d2a4290791fd4d Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 15:45:03 +0200 Subject: [PATCH 123/204] Fixes in migrate script --- helper-scripts/migrate/migrate-1.45-1.46.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 5e8f0d979..d45523716 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -56,7 +56,7 @@ CREATE TABLE IF NOT EXISTS annotations_new ( id SERIAL PRIMARY KEY, dataset TEXT, - field_id SERIAL, + field_id TEXT, item_id TEXT, timestamp INT DEFAULT 0, timestamp_created INT DEFAULT 0, @@ -104,7 +104,7 @@ count = 0 skipped_count = 0 - columns = "dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: @@ -156,9 +156,9 @@ value = ",".join(value) inserts = [( - str(post_id), # post_id; needs to be a string, changes per data source. + row["dataset"], # dataset int(field_id), # field_id; this is an ID for the same type of input field. - row["dataset"], # dataset + str(post_id), # post_id; needs to be a string, changes per data source. dataset["timestamp"], # timestamp dataset["timestamp"], # timestamp_created label, # label From e5cce4905894557ed018549aa0d64f0b56022909 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 15:45:23 +0200 Subject: [PATCH 124/204] Improve Annotation() and make map_item() fetch annotation values --- common/lib/annotation.py | 23 +++++++++++-------- common/lib/dataset.py | 49 ++++++++++++++++++++++------------------ 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index b871006a9..7da8a01a8 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -23,6 +23,7 @@ class Annotation: data = None db = None + id = None # Unique ID for this annotation item_id = None # ID of the item for this annotation, e.g. post ID field_id = None # If of this type of annotation field for this dataset @@ -30,6 +31,7 @@ class Annotation: timestamp = None # When this annotation was edited timestamp_created = None # When this timestamp was created label = None # Label of annotation + type = None # Type of annotation (e.g. `text`) options = None # Possible options value = None # The actual annotation value author = None # Who made the annotation @@ -64,7 +66,7 @@ def __init__(self, data=None, id=None, db=None): # an ID, it is guaranteed to be in the database. # IDs can both be explicitly given or present in the data dict. if id is not None or "id" in data: - if "id" in data: + if data and "id" in data: id = data["id"] self.id = id # IDs correspond to unique serial numbers in the database. 
current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) @@ -105,9 +107,9 @@ def __init__(self, data=None, id=None, db=None): created_timestamp = int(time.time()) new_data = { + "dataset": data["dataset"], "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), - "dataset": data["dataset"], "timestamp": 0, "timestamp_created": created_timestamp, "label": data["label"], @@ -122,17 +124,20 @@ def __init__(self, data=None, id=None, db=None): self.data = new_data new_or_updated = True + for k, v in self.data.items(): + self.__setattr__(k, v) + # Write to db if anything changed if new_or_updated: - self.data["timestamp"] = int(time.time()) - print(self.data) + self.timestamp = int(time.time()) self.write_to_db() - def get_by_id(self, id: int): + def get_by_id(id: int, db): """ Get annotation by ID :param str id: ID of annotation + :param db: Database connection object :return: Annotation object, or `None` for invalid annotation ID """ @@ -141,7 +146,7 @@ def get_by_id(self, id: int): except ValueError: raise AnnotationException("Id '%s' is not valid" % id) - return Annotation(id=id) + return Annotation(id=id, db=db) def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ @@ -386,9 +391,9 @@ def __setattr__(self, attr, value): return if attr not in self.data: - self.parameters[attr] = value + self.metadata[attr] = value attr = "metadata" - value = self.parameters + value = self.metadata if attr == "metadata": value = json.dumps(value) @@ -398,4 +403,4 @@ def __setattr__(self, attr, value): self.data[attr] = value if attr == "metadata": - self.parameters = json.loads(value) \ No newline at end of file + self.metadata = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index c377f37b5..364fa167b 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -385,8 +385,8 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau # Add possible annotations if has_annotations: - # Get the annotation, if available. - post_annotations = self.get_annotations(item_ids=[mapped_item.data["id"]]) + # Get annotations for this specific post + post_annotations = self.get_annotations(item_id=mapped_item.data["id"]) # We're always handling annotated data as a MappedItem object, # even if no map_item() function is available for the data source. @@ -394,18 +394,16 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau mapped_item = MappedItem(mapped_item) for annotation_label in annotation_labels: - - annotation = "" - + value = "" for post_annotation in post_annotations: if post_annotation.label == annotation_label: - annotation = post_annotation.value - if isinstance(annotation, list): - annotation = ",".join(annotation) + value = post_annotation.value + if isinstance(value, list): + value = ",".join(value) - # We're always adding an annotation value, - # as an empty string if it's absent. - mapped_item.data[annotation_label] = annotation + # We're always adding an annotation value + # as an empty string, even if it's absent. 
+ mapped_item.data[annotation_label] = value # yield a DatasetItem, which is a dict with some special properties yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item, **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item)) @@ -1581,26 +1579,33 @@ def has_annotations(self) -> bool: return True if annotation else False - def get_annotations(self, item_ids=[]) -> list: + def get_annotations(self, item_id=[]) -> list: """ Retrieves the annotations for this dataset. - :param item_ids: A list of item IDs to get the annotations from. - If empty, get all the annotations for this dataset. + :param item_id: A list of item IDs to get the annotations from. + If empty, get all the annotations for this dataset. + May also be a string to get annotations from a specific item. return list: All annotations, each in their own dictionary. """ - if item_ids: - annotations = self.db.fetchall("SELECT * FROM annotations " - "WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_ids))) + annotations = [] + if item_id: + if isinstance(item_id, str): + item_id = [item_id] + ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", + (self.key, tuple(item_id))) else: - annotations = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,)) + ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s;", (self.key,)) - if not annotations: - annotations = [] - else: - annotations = [Annotation(data=annotation, db=self.db) for annotation in annotations] + if not ids: + return [] + + ids = [i["id"] for i in ids] + + for id in ids: + annotations.append(Annotation.get_by_id(id, self.db)) return annotations From be8ac89c8323fffbb258119ed506d7f7fc1484f2 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 12 Aug 2024 17:13:28 +0200 Subject: [PATCH 125/204] First steps to make new annotation system work with Explorer --- backend/lib/processor.py | 14 ++++++++- common/lib/annotation.py | 12 ++++---- common/lib/dataset.py | 3 +- webtool/templates/explorer/explorer.html | 2 ++ .../templates/explorer/post-annotations.html | 29 +++++++++++-------- webtool/templates/explorer/post.html | 3 -- webtool/views/views_explorer.py | 13 ++++++--- 7 files changed, 50 insertions(+), 26 deletions(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 5a0aaff52..3ee7704d1 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -3,6 +3,7 @@ """ import re import traceback +import hashlib import zipfile import typing import shutil @@ -738,13 +739,16 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not source_dataset: source_dataset = self.source_dataset + # Create a field ID based on the + # Check if this dataset already has annotation fields + field_id = "" existing_labels = source_dataset.get_annotation_field_labels() # Set some values for annotation in annotations: - # Set the default author and label to this processor's name + # Set the default label to this processor's name if not annotation.get("label"): # If the processor has already generated annotation fields, # add a number to differentiate the label @@ -752,6 +756,8 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not overwrite and label in existing_labels: label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) annotation["label"] = label + + # Set the author to this processor's name if not annotation.get("author"): 
annotation["author"] = self.name @@ -762,6 +768,12 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa annotation["metadata"] = {} annotation["metadata"]["processor-parameters"] = self.parameters + if not annotation.get("field_id"): + if not field_id: + field_id = source_dataset.key + annotation["label"] + field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() + annotation["field_id"] = field_id + annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) return annotations_saved diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 7da8a01a8..b36ae566c 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -109,7 +109,7 @@ def __init__(self, data=None, id=None, db=None): new_data = { "dataset": data["dataset"], "item_id": data["item_id"], - "field_id": data["field_id"] if data.get("field_id") else self.get_field_id(data["dataset"], data["label"]), + "field_id": data["field_id"] if data.get("field_id") else self.set_field_id(data["dataset"], data["label"]), "timestamp": 0, "timestamp_created": created_timestamp, "label": data["label"], @@ -161,14 +161,14 @@ def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", - (dataset_key, item_id, label)) + (dataset_key, str(item_id), label)) if not data: return {} data["metadata"] = json.loads(data["metadata"]) return data - def get_field_id(self, dataset_key: str, label: str) -> str: + def set_field_id(self, dataset_key: str, label: str) -> str: """ Sets a `field_id` based on the dataset key and label. This combination should be unique. @@ -176,9 +176,11 @@ def get_field_id(self, dataset_key: str, label: str) -> str: :param dataset_key: The dataset key :param label: The label of the dataset. """ - field_id = hashlib.md5(dataset_key + label.encode("utf-8")).hexdigest() + + field_id = source_dataset.key + annotation["label"] + field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() self.field_id = field_id - return field_id + return self.field_id def write_to_db(self): """ diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 364fa167b..d1aeb9838 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1592,8 +1592,9 @@ def get_annotations(self, item_id=[]) -> list: annotations = [] if item_id: - if isinstance(item_id, str): + if isinstance(item_id, str) or isinstance(item_id, int): item_id = [item_id] + item_id = [str(i) for i in item_id] ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_id))) else: diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 137864b5e..eb31de870 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -26,6 +26,8 @@ + +{% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} {% set key = dataset.data.key %} diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index e6f4c731f..94b57b47a 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -1,26 +1,31 @@
diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html
index e6f4c731f..94b57b47a 100644
--- a/webtool/templates/explorer/post-annotations.html
+++ b/webtool/templates/explorer/post-annotations.html
@@ -1,26 +1,31 @@
    {% if annotation_fields %}
-        {% set old_annotations = None %}
        {% if annotations and post.id in annotations %}
-            {% set old_annotations = annotations[post.id] %}
+            {% set post_annotations = annotations[post.id] %}
        {% endif %}
        {% for field in annotation_fields %}
-            {% set type = annotation_fields[field]["type"] %}
+            {% set type = annotation_fields[field]["type"] %}
            {% set label = annotation_fields[field]["label"] %}
-            {% set old_annotation = "" %}
-            {% if old_annotations and label in old_annotations %}
-                {% set old_annotation = old_annotations[label] %}
-            {% endif %}
-            
+
+            {% set annotation = {} %}
+            {% for post_annotation in post_annotations %}
+                {% if post_annotation.field_id == field %}
+                    {% set annotation = post_annotation %}
+                {% endif %}
+            {% endfor %}
+
            
    {% if type == 'text' %} - + {% elif type == 'textarea' %} - + {% elif type == 'dropdown' %} @@ -38,7 +43,7 @@ {% for option in annotation_fields[field]["options"] %} {% set option_id = option.keys() | first %} {% set option_label = option.values() | first %} - {% set checked = "checked" if old_annotation and option_label in old_annotation else "" %} + {% set checked = "checked" if option_label in annotation.value else "" %} {% endfor %} diff --git a/webtool/templates/explorer/post.html b/webtool/templates/explorer/post.html index 52b045886..61074757b 100644 --- a/webtool/templates/explorer/post.html +++ b/webtool/templates/explorer/post.html @@ -1,8 +1,5 @@
  • - - {% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} - {% if template == "datasource" %} {% include "explorer/datasource-templates/" + datasource + ".html" %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index a3acb82d3..b0ba04696 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -89,6 +89,9 @@ def explorer_dataset(key, page=1): posts = [] count = 0 + # Load annotations with post IDs as keys and their annotations as lists. + annotations = {} + # We don't need to sort if we're showing the existing dataset order (the default). # If we're sorting, we need to iterate over the entire dataset first. if not sort or (sort == "dataset-order" and reverse == False): @@ -120,6 +123,12 @@ def explorer_dataset(key, page=1): if not posts: return error(404, error="No posts or posts could not be displayed") + # Check whether there's already annotations made. + # If so, also pass these to the template and set the post ID + # as key so we can easily retrieve them. + for post_id in post_ids: + annotations[post_id] = dataset.get_annotations(item_id=post_id) + # We can use either a generic or a pre-made data source-specific template. template = "datasource" if has_datasource_template(datasource) else "generic" if template == "generic": @@ -130,10 +139,6 @@ def explorer_dataset(key, page=1): with open(posts_css, "r", encoding="utf-8") as css: posts_css = css.read() - # Check whether there's already annotations inserted already. - # If so, also pass these to the template. - annotations = dataset.get_annotations() - # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) From f0e61c3e00a83d1e2642e687aea73427268f8133 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 13 Aug 2024 19:30:13 +0200 Subject: [PATCH 126/204] Make annotations editable and saveable in Explorer --- common/lib/annotation.py | 22 +- common/lib/dataset.py | 8 +- webtool/static/js/explorer.js | 83 +++---- .../templates/explorer/post-annotations.html | 34 ++- webtool/views/views_explorer.py | 222 +++++++++--------- 5 files changed, 207 insertions(+), 162 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index b36ae566c..aadee1205 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -86,6 +86,9 @@ def __init__(self, data=None, id=None, db=None): # If we were able to retrieve an annotation from the db, it already exists if current: + + #current["metadata"] = json.loads(current["metadata"]) + # Check if we have to overwrite old data with new data if data: for key, value in data.items(): @@ -110,7 +113,7 @@ def __init__(self, data=None, id=None, db=None): "dataset": data["dataset"], "item_id": data["item_id"], "field_id": data["field_id"] if data.get("field_id") else self.set_field_id(data["dataset"], data["label"]), - "timestamp": 0, + "timestamp": created_timestamp, "timestamp_created": created_timestamp, "label": data["label"], "type": data.get("type", "text"), @@ -124,7 +127,22 @@ def __init__(self, data=None, id=None, db=None): self.data = new_data new_or_updated = True + if isinstance(self.data["metadata"], str): + try: + self.metadata = 
json.loads(self.data["metadata"]) + except (TypeError, json.JSONDecodeError): + self.metadata = {} + for k, v in self.data.items(): + # Some type checking + try: + if k == "timestamp" or k == "timestamp_created": + v = int(v) + elif k == "by_processor": + v = bool(v) + except ValueError as e: + raise AnnotationException("Annotation fields are not of the right type (%s)" % e) + self.__setattr__(k, v) # Write to db if anything changed @@ -400,9 +418,9 @@ def __setattr__(self, attr, value): if attr == "metadata": value = json.dumps(value) + self.timestamp = int(time.time()) self.db.update("annotations", where={"id": self.id}, data={attr: value}) self.data[attr] = value - if attr == "metadata": self.metadata = json.loads(value) \ No newline at end of file diff --git a/common/lib/dataset.py b/common/lib/dataset.py index d1aeb9838..56aeb5b5e 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1591,13 +1591,17 @@ def get_annotations(self, item_id=[]) -> list: """ annotations = [] + + # Get annotation IDs first if item_id: + # Get specific annotations if IDs are given if isinstance(item_id, str) or isinstance(item_id, int): item_id = [item_id] item_id = [str(i) for i in item_id] ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;", (self.key, tuple(item_id))) else: + # Else just get all the annotation data from this dataset ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s;", (self.key,)) if not ids: @@ -1605,6 +1609,7 @@ def get_annotations(self, item_id=[]) -> list: ids = [i["id"] for i in ids] + # Then get the annotations by ID for id in ids: annotations.append(Annotation.get_by_id(id, self.db)) @@ -1661,8 +1666,7 @@ def save_annotations(self, annotations: list, overwrite=True) -> int: :param list annotations: List of dictionaries with annotation items. Must have `item_id` and `label`. E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}] - :param bool overwrite: Whether to overwrite annotation if the label is already present - for the dataset. + :param bool overwrite: Whether to overwrite the annotation if it already present. :returns int: How many annotations were saved. diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 35104d28b..6648c8ced 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -317,7 +317,7 @@ const annotations = { return annotation_fields; }, - parseAnnotation: function(e) { + parseAnnotation: function(el) { /* Converts the DOM objects of an annotation field to an annotation Object. @@ -326,34 +326,35 @@ const annotations = { */ - annotation = {} + let ann = $(el) + let field_id = ann.attr("class").split(" ")[1].replace("field-", ""); + let annotation_type = ann.attr("class").split(" ")[2].replace("type-", ""); + let item_id = ann.attr("class").split(" ")[3].replace("item-id-", ""); + let author = "Jan" + let label = ann.find(".annotation-label").text(); - let label = $(this).find(".annotation-label").text(); - let annotation_type = $(this).attr("class").split(" ").pop(); let val = undefined; let edited = false - let timestamp = Date.now() / 100 if (annotation_type === "text" || annotation_type === "textarea") { - val = $(this).find(".post-annotation-input").val(); + val = ann.find(".post-annotation-input").val(); // It can be the case that the input text is deleted // In this case we *do* want to push new data, so we check // whether there's an 'edited' class present and save if so. 
- if ($(this).find(".post-annotation-input").hasClass("edited")) { + if (ann.find(".post-annotation-input").hasClass("edited")) { edited = true } } else if (annotation_type === "dropdown") { - let selected = $(this).find(".post-annotation-options").val(); - val = selected; + val = ann.find(".post-annotation-options").val(); } else if (annotation_type === "checkbox") { val = []; - $(this).find(".post-annotation-options > input").each(function(){ - if ($(this).is(":checked")) { - val.push($(this).val()); + ann.find(".post-annotation-options > input").each(function(){ + if (ann.is(":checked")) { + val.push(ann.val()); } - if ($(this).hasClass("edited")) { + if (ann.hasClass("edited")) { edited = true } }); @@ -361,27 +362,27 @@ const annotations = { val = undefined; } } - if ((val !== undefined && val !== "") || edited) { + /*if ((val !== undefined && val !== "") || edited) { vals_changed = true; val = ""; - } + console.log("EDITED") + }*/ + /*if (vals_changed){ + annotation[post_id] = post_vals; + } +*/ // Create an annotation object and add them to the array. let annotation = { - "field_id": "", - "post_id": post_id, - "dataset": "", - "timestamp": timestamp, - "timestamp_created": "", + "field_id": field_id, + "item_id": item_id, "label": label, "type": annotation_type, - "options": "", - "value": "", - "author": "", - "by_processor": "", - "metadata": "" + "value": val, + "author": author, + "by_processor": false // Explorer annotations are human-made! } - + console.log(annotation) return annotation }, @@ -494,7 +495,7 @@ const annotations = { // For dropdowns and checkboxes, we're checking whether we // have to add or change any of their options. - else if (input_type == "checkbox" || input_type == "dropdown"){ + else if (input_type === "checkbox" || input_type === "dropdown"){ let options = annotation_fields[field].options; let valid_options = []; @@ -521,10 +522,10 @@ const annotations = { let post_id = $(this).parents("li").attr("id").split("post-")[1]; post_option_id = post_id + "-" + option_id; - if (input_type == "dropdown") { + if (input_type === "dropdown") { $(this).append(""); } - else if (input_type == "checkbox") { + else if (input_type === "checkbox") { $(this).append(""); } }); @@ -577,15 +578,15 @@ const annotations = { el = "
    "; // Add a text input for text fields - if (input_type == "text") { + if (input_type === "text") { el += ""; } - else if (input_type == "textarea") { + else if (input_type === "textarea") { el += ""; } // Add a dropdown for dropdown fields - else if (input_type == "dropdown") { + else if (input_type === "dropdown") { el += " @@ -34,7 +34,7 @@ {% for option in annotation_fields[field]["options"] %} {% set option_id = option.keys() | first %} {% set option_label = option.values() | first %} - + {% endfor %} @@ -50,6 +50,24 @@
    {% endif %} + {# Tooltip with metadata on the annotation #} + {% if annotation.author or annotation.timestamp or annotation.metadata %} + + + {% endif %} + {% endif %}
  • {% endfor %} {% endif %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index b0ba04696..88a2b64d8 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -3,8 +3,6 @@ format and lets users annotate the data. """ -import json - from pathlib import Path from flask import request, render_template @@ -12,7 +10,6 @@ from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet -from common.lib.annotation import Annotation from common.lib.helpers import convert_to_float from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -20,13 +17,13 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 1}) -@app.route('/results//explorer/page/') +@app.route('/results//explorer/', defaults={'page': 1}) +@app.route('/results//explorer/page/') @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(key, page=1): +def explorer_dataset(dataset_key: str, page=1): """ Show posts from a dataset @@ -39,7 +36,7 @@ def explorer_dataset(key, page=1): # Get dataset info. try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") @@ -48,7 +45,6 @@ def explorer_dataset(key, page=1): datasource = parameters["datasource"] post_count = int(dataset.data["num_rows"]) annotation_fields = dataset.get_annotation_fields() - datasource_config = config.get("explorer.config", {}).get(datasource,{}) warning = "" # See if we can actually serve this page @@ -76,7 +72,7 @@ def explorer_dataset(key, page=1): # If the dataset is generated from an API-accessible database, we can add # extra features like the ability to navigate across posts. - has_database = False # INTEGRATE LATER ///////////////////// + has_database = False # todo: integrate # Check if we have to sort the data. sort = request.args.get("sort") @@ -92,7 +88,7 @@ def explorer_dataset(key, page=1): # Load annotations with post IDs as keys and their annotations as lists. annotations = {} - # We don't need to sort if we're showing the existing dataset order (the default). + # We don't need to sort if we're showing the existing dataset order (default). # If we're sorting, we need to iterate over the entire dataset first. if not sort or (sort == "dataset-order" and reverse == False): for row in dataset.iterate_items(warn_unmappable=False): @@ -125,16 +121,17 @@ def explorer_dataset(key, page=1): # Check whether there's already annotations made. # If so, also pass these to the template and set the post ID - # as key so we can easily retrieve them. + # as key, so we can easily retrieve them. for post_id in post_ids: annotations[post_id] = dataset.get_annotations(item_id=post_id) - # We can use either a generic or a pre-made data source-specific template. + # We can use either a generic or a pre-made, data source-specific template. 
template = "datasource" if has_datasource_template(datasource) else "generic" if template == "generic": posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/generic.css") else: posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css") + # Read CSS and pass as a string with open(posts_css, "r", encoding="utf-8") as css: posts_css = css.read() @@ -142,89 +139,17 @@ def explorer_dataset(key, page=1): # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) -@app.route('/results///explorer') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_api_thread(datasource, thread_id): - """ - /// INTEGRATE LATER! - - Show a thread from an API-accessible database. - - :param str datasource: Data source ID - :param str board: Board name - :param int thread_id: Thread ID - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not thread_id: - return error(404, error="No thread ID provided") - - # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.max_posts', 500000) - - # Get the posts with this thread ID. - posts = get_local_posts(db, datasource, ids=tuple([thread_id]), threads=True, order_by=["id"]) - - if not posts: - return error(404, error="No posts available for this thread") - - posts = [strip_html(post) for post in posts] - posts = [format(post, datasource=datasource) for post in posts] - - return render_template("explorer/explorer.html", datasource=datasource, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) - -@app.route('/explorer/post///') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_api_posts(datasource, post_ids): - """ - /// INTEGRATE LATER - - Show posts from an API-accessible database. - - :param str datasource: Data source ID - :param str board: Board name - :param int post_ids: Post IDs - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not post_ids: - return error(404, error="No thread ID provided") - - # Get the posts with this thread ID. 
- posts = get_database_posts(db, datasource, board=board, ids=tuple([post_ids]), threads=True, order_by=["id"]) - - posts = [strip_html(post) for post in posts] - posts = [format(post) for post in posts] - - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts)) - @app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(key): +def explorer_save_annotation_fields(key: str) -> int: """ Save the annotation fields of a dataset to the datasets table. - :param str key: The dataset key. + :param key: The dataset key. :return-error 404: If the dataset ID does not exist. :return int: The number of annotation fields saved. @@ -244,38 +169,46 @@ def explorer_save_annotation_fields(key): return "success" -@app.route("/explorer/save_annotations/", methods=["POST"]) +@app.route("/explorer/save_annotations/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotations(key=None): +def explorer_save_annotations(dataset_key: str): """ Save the annotations of a dataset to the annotations table. - :param str key: The dataset key. + :param dataset_key: The dataset key. Must be explicitly given to ensure + annotations are tied to a dataset + + :return-error 404: If the dataset key does not exist. - :return-error 404: If the dataset key does not exist. - :return int: The number of posts with annotations saved. """ # Save it! annotations = request.get_json() - - # Annotations are always associated with a dataset. - if not key and annotations: - key = annotations[0].get("dataset", "") - if not key: - return error(404, error="No dataset key provided") try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") - - return dataset.save_annotations(annotations) -def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): + dataset.save_annotations(annotations, overwrite=True) + return "success" + +@app.route("/explorer/save_annotation/", methods=["POST"]) +@api_ratelimit +@login_required +@setting_required("privileges.can_run_processors") +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_save_annotation(key="") -> int: + """ + todo: integrate + """ + return 0 + +def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. This is basically a wrapper function for `iterate_items()` with the @@ -284,9 +217,11 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): This first iterates through the entire file (with a max limit) to determine an order. Then it yields items based on this order. - :param key, str: The dataset object. + :param dataset, str: The dataset object. :param sort_by, str: The item key that determines the sort order. :param reverse, bool: Whether to sort by largest values first. 
+ + :returns dict: Yields iterated post """ # Storing posts in the right order here @@ -314,6 +249,7 @@ def sort_and_iterate_items(dataset, sort=None, reverse=False, **kwargs): def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ + todo: Integrate later Retrieve posts by ID from a database-accessible data source. """ @@ -335,19 +271,93 @@ def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, of return posts -def has_datasource_template(datasource): +def has_datasource_template(datasource: str) -> bool: """ Check if the data source has a data source-specific template. This requires HTML and CSS files. Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. Custom CSS files should be placed in `webtool/static/css/explorer/.css`. - :param datasource, str: Datasource name. - :return: bool, Whether the required files are present. + :param datasource: Datasource name. + + :returns: bool, Whether the required files are present. """ css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() if css_exists and html_exists: return True - return False \ No newline at end of file + return False + +@app.route('/results///explorer') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_thread(datasource, thread_id): + """ + todo: INTEGRATE LATER! + + Show a thread from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int thread_id: Thread ID + + :return-error 404: If the thread ID does not exist for the given data source. + """ + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not thread_id: + return error(404, error="No thread ID provided") + + # The amount of posts that may be included (limit for large datasets) + max_posts = config.get('explorer.max_posts', 500000) + + # Get the posts with this thread ID. + #todo: define function get_api_posts + posts = get_api_posts(db, datasource, ids=tuple([thread_id]), threads=True, order_by=["id"]) + + if not posts: + return error(404, error="No posts available for this thread") + + posts = [strip_html(post) for post in posts] + posts = [format(post, datasource=datasource) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) + +@app.route('/explorer/post///') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_posts(datasource, post_ids): + """ + todo: INTEGRATE LATER + + Show posts from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int post_ids: Post IDs + + :return-error 404: If the thread ID does not exist for the given data source. 
+ """ + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not post_ids: + return error(404, error="No thread ID provided") + + # Get the posts with this thread ID. + posts = get_database_posts(db, datasource, board=board, ids=tuple([post_ids]), threads=True, order_by=["id"]) + + posts = [strip_html(post) for post in posts] + posts = [format(post) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts)) \ No newline at end of file From 28032b5ad820f27edd64e7db203391e461b68eb9 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:03 +0200 Subject: [PATCH 127/204] Make Tumblr search code a bit neater --- datasources/tumblr/search_tumblr.py | 239 +++++++++++++++------------- 1 file changed, 126 insertions(+), 113 deletions(-) diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0f696507b..ddf02f023 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -15,7 +15,6 @@ import json from requests.exceptions import ConnectionError from datetime import datetime -from ural import urls_from_text from common.config_manager import config from backend.lib.search import Search @@ -23,12 +22,12 @@ from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ConfigException from common.lib.item_mapping import MappedItem - __author__ = "Sal Hagen" __credits__ = ["Sal Hagen", "Tumblr API (api.tumblr.com)"] __maintainer__ = "Sal Hagen" __email__ = "4cat@oilab.eu" + class SearchTumblr(Search): """ Tumblr data filter module. @@ -38,15 +37,17 @@ class SearchTumblr(Search): title = "Search Tumblr" # title displayed in UI description = "Retrieve Tumblr posts by tags or blogs." # description displayed in UI extension = "ndjson" # extension of result file, used internally and in UI - is_local = False # Whether this datasource is locally scraped - is_static = False # Whether this datasource is still updated + is_local = False # Whether this datasource is locally scraped + is_static = False # Whether this datasource is still updated # not available as a processor for existing datasets accepts = [None] max_workers = 1 - max_retries = 3 # For API and connection retries. - max_date_retries = 96 + 150 # For checking dates. 96 time retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). + # For API and connection retries. + max_retries = 3 + # For checking dates. 96 retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). + max_date_retries = 96 + 150 max_posts = 1000000 max_reblogs = 1000 @@ -109,23 +110,21 @@ def get_options(cls, parent_dataset=None, user=None): "query": { "type": UserInput.OPTION_TEXT_LARGE, "help": "Tags, blogs, or post URLs.", - "tooltip": " Seperate with comma or newline. Example:\n#research tools, @4catblog, https://tumblr.com/4catblog/12347714095" + "tooltip": "Seperate with comma or newline, e.g.: #research tools, @4catblog, https://tumblr.com/4catblog/123456789" }, "get_notes": { "type": UserInput.OPTION_TOGGLE, "help": "Add note data (warning: slow)", "tooltip": "Add note data for every post. This includes note metrics, " - "replies, reblogged text, and reblogged images. " - "Blog- and id-level search includes reblogged text by default. 
" - "Enables adding reblogs as new posts " - "Limited to the first 1,000 reblogs per post.", + "replies, reblogged text, and reblogged images. " + "Blog- and id-level search includes reblogged text by default. " + "Limited to the first 1,000 reblogs per post.", "default": False }, "get_reblogs": { "type": UserInput.OPTION_TOGGLE, "help": "Add reblogs", - "tooltip": "Add reblogs to the dataset. " - "", + "tooltip": "Add reblogs of initially captured posts as new posts to the dataset. ", "requires": "get_notes==true", "default": False }, @@ -149,7 +148,7 @@ def get_options(cls, parent_dataset=None, user=None): } try: - config_keys = SearchTumblr.get_tumblr_keys(user) + SearchTumblr.get_tumblr_keys(user) except ConfigException: # No 4CAT set keys for user; let user input their own options["key-info"] = { @@ -185,24 +184,24 @@ def get_options(cls, parent_dataset=None, user=None): } options["divider"] = { - "type": UserInput.OPTION_DIVIDER - } + "type": UserInput.OPTION_DIVIDER + } options["date-intro"] = { - "type": UserInput.OPTION_INFO, - "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " - "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " - "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" - "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " - "the date parameter (before) with six hours and sends the query again. This often " - "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " - "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " - "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " - "your dataset to see if it this is indeed the case and whether any odd time gaps exists." - } + "type": UserInput.OPTION_INFO, + "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " + "posts, even if posts exists. Waiting for a while and querying again can help, even with identical queries. " + "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" + "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " + "the date parameter (before) with six hours and sends the query again. This often " + "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " + "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " + "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " + "your dataset to see if it this is indeed the case and whether any odd time gaps exists." 
+ } options["daterange"] = { - "type": UserInput.OPTION_DATERANGE, - "help": "Date range" - } + "type": UserInput.OPTION_DATERANGE, + "help": "Date range" + } return options @@ -240,14 +239,16 @@ def get_items(self, query): # Connect to Tumblr API try: self.client = self.connect_to_tumblr() - except ConfigException as e: + except ConfigException: self.log.warning(f"Could not connect to Tumblr API: API keys invalid or not set") self.dataset.finish_with_error(f"Could not connect to Tumblr API: API keys invalid or not set") return except ConnectionRefusedError as e: client_info = self.client.info() self.log.warning(f"Could not connect to Tumblr API: {e}; client_info: {client_info}") - self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") + self.dataset.finish_with_error( + f"Could not connect to Tumblr API:" + f"{client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return # For each tag or blog, get posts @@ -270,20 +271,20 @@ def get_items(self, query): # Post URL elif "tumblr.com/" in query: - + try: # Format https://{blogname}.tumblr.com/post/{post_id} if "/post/" in query: blog_name = query.split(".tumblr.com")[0].replace("https://", "").replace("www.", "").strip() post_id = query.split("/")[-1].strip() - # May also be a slug string.. + # May also be a slug string. if not post_id.isdigit(): post_id = query.split("/")[-2].strip() # Format https://tumblr.com/{blogname}/{post_id} else: blog_and_id = query.split("tumblr.com/")[-1] - blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL + blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL blog_name, post_id = blog_and_id.split("/") if not post_id.isdigit(): post_id = query.split("/")[-2].strip() @@ -322,7 +323,7 @@ def get_items(self, query): # The post rail is stored in the trail list for trail_post in result.get("trail", []): # Some posts or blogs have been deleted; skip these - if not "broken_blog_name" in trail_post: + if "broken_blog_name" not in trail_post: if trail_post["post"]["id"] not in self.seen_ids: extra_posts.append({"blog": trail_post["blog"]["name"], "id": trail_post["post"]["id"]}) @@ -331,12 +332,13 @@ def get_items(self, query): # Blog-level searches already have some note data, like reblogged text, # but not everything (like replies), so we're going to retrieve these here as well. # Also store IDs of reblogs/reblogged posts that we want to add. - if get_notes: - # Create a dictionary with the `reblog_key` as key and notes as value. - # Notes are the same for all posts in a reblog chain. - # This means that we may not have to re-query the same data. - retrieved_notes = {} + # Create a dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain. + # This means that we may not have to re-query the same data. + retrieved_notes = {} + + if get_notes: for i, post in enumerate(results): @@ -345,7 +347,7 @@ def get_items(self, query): if self.api_limit_reached: break - self.dataset.update_status("Retrieving notes for post %i/%i" % (i+1, len(results))) + self.dataset.update_status("Retrieving notes for post %i/%i" % (i + 1, len(results))) # We may have already encountered this note-chain # with a different post. @@ -368,31 +370,33 @@ def get_items(self, query): # Only gets first 1,000 replies or text/tag reblogs. 
# We're using different querying modes since - # it'll speed up the process. The fastest is + # it'll speed up the process. The fastest is # `conversation`, which prioritises text reblogs and # replies, and also provides metrics on like and reblog counts; # we'll use this as default. If the user # has indicated they also want to add reblogs with tags, # we'll also use the `reblogs_with_tags` mode. seen_notes = set() - notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", max_reblogs=self.max_reblogs) + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", + max_reblogs=self.max_reblogs) reblog_count = 0 for note in notes["notes"]: - if note["type"] == "reblog": # Replies don't have IDs + if note["type"] == "reblog": # Replies don't have IDs reblog_count += 1 seen_notes.add(note["post_id"]) # Get tag-only reblogs; these aren't returned in `conversation` mode. if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: - tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", max_reblogs=self.max_reblogs - reblog_count) + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", + max_reblogs=self.max_reblogs - reblog_count) for tag_note in tag_notes["notes"]: if tag_note["post_id"] not in seen_notes: notes["notes"].append(tag_note) - + # Add to posts results[i] = {**results[i], **notes} retrieved_notes[post["reblog_key"]] = notes - + # Identify which notes/reblogs we can collect as new posts if get_reblogs: @@ -411,24 +415,25 @@ def get_items(self, query): continue extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) - + # Add reblogged posts and reblogs to dataset for i, extra_post in enumerate(extra_posts): - + self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i, len(extra_posts))) if extra_post["id"] not in self.seen_ids: - + # Potentially skip new posts outside of the date range # not always present in the notes data. if not reblog_outside_daterange and (max_date and min_date): - new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, min_date=min_date) + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, + min_date=min_date) else: new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) if new_post: new_post = new_post[0] - + # Add note data; these are already be retrieved above if get_notes: new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} @@ -442,9 +447,10 @@ def get_items(self, query): def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ Get Tumblr posts posts with a certain tag. - :param tag, str: the tag you want to look for + :param tag: the tag you want to look for :param min_date: a unix timestamp, indicates posts should be min_date this date. :param max_date: a unix timestamp, indicates posts should be max_date this date. + :param api_key: The api key. :returns: a dict created from the JSON response """ @@ -455,7 +461,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): retries = 0 date_retries = 0 - # We're gonna change max_date, so store a copy for reference. + # We're going to change max_date, so store a copy for reference. max_date_original = max_date # We use the average time difference between posts to spot possible gaps in the data. 
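The average-interval bookkeeping mentioned above underpins a simple gap heuristic: if the time difference between two consecutive posts is much larger than the running average, the API has probably skipped a stretch of posts and the query is restarted from that point. A minimal sketch of the check, detached from the Tumblr client (the function name and threshold factor are illustrative):

    def looks_like_gap(timestamps: list, factor: int = 5) -> bool:
        # Posts arrive newest-first; a sudden large interval relative to
        # the average so far suggests the API silently skipped a stretch.
        if len(timestamps) < 3:
            return False
        diffs = [a - b for a, b in zip(timestamps, timestamps[1:])]
        avg_so_far = sum(diffs[:-1]) / len(diffs[:-1])
        return diffs[-1] > avg_so_far * factor

    print(looks_like_gap([1000, 990, 980, 900]))  # True: 80 >> average of 10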
@@ -479,7 +485,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): break try: - # PyTumblr does not allow to use the `npf` parameter yet + # PyTumblr does not allow to use the `npf` parameter yet # for the `tagged` endpoint (opened a pull request), so # we're using requests here. params = { @@ -494,19 +500,19 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): url = "https://api.tumblr.com/v2/tagged" response = requests.get(url, params=params) posts = response.json()["response"] - + except ConnectionError: - self.update_status("Encountered a connection error, waiting 10 seconds") + self.dataset.update_status("Encountered a connection error, waiting 10 seconds") time.sleep(10) retries += 1 continue - # Skip posts that we already enountered, + # Skip posts that we already encountered, # preventing Tumblr API shenanigans or double posts because of # time reductions. Make sure it's no error string, though. new_posts = [] for post in posts: - # Sometimes the API repsonds just with "meta", "response", or "errors". + # Sometimes the API responds just with "meta", "response", or "errors". if isinstance(post, str): self.dataset.update_status("Couldn't add post:", post) retries += 1 @@ -537,17 +543,18 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): date_retries += 1 - # We're first gonna check carefully if there's small time gaps by + # We're first going to check carefully if there's small time gaps by # decreasing by six hours. # If that didn't result in any new posts, also dedicate 12 date_retries # with reductions of six months, just to be sure there's no data from # years earlier missing. if date_retries < 96: - max_date -= 21600 # Decrease by six hours + max_date -= 21600 # Decrease by six hours elif date_retries <= self.max_date_retries: - max_date -= 604800 # Decrease by one week - self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % (tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) + max_date -= 604800 # Decrease by one week + self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % ( + tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. 
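The retry ladder above first steps the `before` bound back six hours at a time and then a full week at a time; the bound check just below then ends the search once `max_date` drops under `min_date`. The decrement schedule, condensed into a sketch (constants taken from the code above):

    def next_before(max_date: int, date_retries: int) -> int:
        # First 96 retries step back six hours; later retries a full week.
        step = 21600 if date_retries < 96 else 604800
        return max_date - step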
if min_date != 0:
@@ -587,7 +594,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 
 				if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5):
 
 					time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S")
-					self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,))
+					self.dataset.update_status(
+						"Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,))
 
 					posts = [post for post in posts if post["timestamp"] >= date]
 					if posts:
 						all_posts += posts
@@ -607,7 +615,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 
 			if max_date < min_date:
 				# Get rid of all the posts that are earlier than the max_date timestamp
-				posts = [post for post in posts if post["timestamp"] >= min_date and post["timestamp"] <= max_date_original]
+				posts = [post for post in posts if min_date <= post["timestamp"] <= max_date_original]
 
 				if posts:
 					all_posts += posts
@@ -636,7 +644,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None):
 					self.max_posts_reached = True
 					break
 
-			self.dataset.update_status("Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,))
+			self.dataset.update_status(
+				"Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,))
 			time.sleep(.2)
 
 		return all_posts
 
 	def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 		"""
 		Get Tumblr posts from a certain blog
-		:param blog, str: the name of the blog you want to look for
-		:param post_id, str: the post ID (optional)
-		:param max_date: a unix timestamp, indicates posts should be max_date this date. 
-		:param min_date: a unix timestamp, indicates posts should be min_date this date. 
+		:param blog: the name of the blog you want to look for
+		:param post_id: the post ID (optional)
+		:param max_date: a unix timestamp, indicates posts should be before this date.
+		:param min_date: a unix timestamp, indicates posts should be after this date.
 
 		:returns: a dict created from the JSON response
 		"""
 
 		if post_id:
 			try:
-				test_id = int(post_id)
+				int(post_id)
 			except TypeError:
 				raise QueryParametersException("Post ID %s is invalid" % post_id)
 
 		# Some retries to make sure the Tumblr API actually returns everything
 		retries = 0
-		self.max_retries = 48 # 2 days
+		self.max_retries = 48  # 2 days
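The collection loop that follows pages backwards through a blog by repeatedly lowering the `before` timestamp until the API returns nothing new. A stripped-down sketch of that pattern, assuming a pytumblr-style client (retries, date bounds and error handling omitted):

    def fetch_blog_posts(client, blog, before=None):
        collected = []
        while True:
            # Each call returns up to 20 posts older than `before`.
            batch = client.posts(blog, before=before, limit=20, npf=True).get("posts", [])
            if not batch:
                break
            collected += batch
            # Continue from the oldest post retrieved so far.
            before = batch[-1]["timestamp"]
        return collected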
 		# Get Tumblr posts until there are none left.
 		while True:
@@ -682,7 +691,8 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 
 			try:
 				# Use the pytumblr library to make the API call
-				posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw", npf=True)
+				posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True,
+										  notes_info=True, filter="raw", npf=True)
 				posts = posts["posts"]
 
 			except ConnectionRefusedError:
@@ -691,23 +701,26 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 					self.failed_posts.append(post_id)
 					self.dataset.update_status("ConnectionRefused: Unable to collect post %s/%s" % (blog, post_id))
 				else:
-					self.dataset.update_status("ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date))
+					self.dataset.update_status(
+						"ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date))
 				time.sleep(10)
 				continue
 
 			except Exception as e:
-				self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date))
+				self.dataset.update_status("Couldn't collect posts; likely reached the limit of the Tumblr API (%s). "
+										   "Last timestamp: %s" % (e, str(max_date)))
 				self.api_limit_reached = True
 				break
 
 			# Make sure the Tumblr API doesn't magically stop at an earlier date
 			if not posts or isinstance(posts, str):
 				retries += 1
-				max_date -= 3600 # Decrease by an hour
-				self.dataset.update_status("No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
+				max_date -= 3600  # Decrease by an hour
+				self.dataset.update_status(
+					"No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries))
 				continue
 
-			# Skip posts that we already enountered,
+			# Skip posts that we already encountered,
 			# preventing Tumblr API shenanigans or double posts because of
 			# time reductions. Make sure it's no error string, though.
 			new_posts = []
@@ -726,7 +739,7 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 			# Possibly only keep posts within the date range.
 			if max_date and min_date:
 				new_posts = [p for p in new_posts if min_date <= p["timestamp"] <= max_date]
-
+
 			if not new_posts:
 				break
@@ -749,16 +762,17 @@ def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None):
 
 		return all_posts
 
-	def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000):
+	def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000) -> dict:
 		"""
 		Gets data on the notes of a specific post.
-		:param blog_id, str: The ID of the blog.
-		:param post_id, str: The ID of the post.
-		:param mode, str: The type of notes that get priority.
-		`conversation` prioritises text reblogs and replies.
-		:param mode, max_reblogs: Maximum amount of notes to return.
-
-		:returns: a list with dictionaries of notes.
+		:param blog_id: The ID of the blog.
+		:param post_id: The ID of the post.
+		:param mode: The type of notes that get priority.
+		`conversation` prioritises text reblogs and replies.
+		:param max_reblogs: Maximum amount of notes to return,
+		i.e. the number of reblogs to collect.
+
+		:returns: a dictionary with notes and note metrics.
""" post_notes = [] @@ -780,6 +794,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): stop_collecting = False # For status updates + note_type = "" if mode == "conversation": note_type = "reblogs with text" elif mode == "reblogs_with_tags": @@ -800,11 +815,12 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): # Important: we're getting notes in 'conversation' mode to # prioritise replies and reblogs that add text. - # We're not interested in the the names of authors that liked the post + # We're not interested in the names of authors that liked the post # or who reblogged without adding content. notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode=mode) except ConnectionRefusedError: - self.dataset.update_status("Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) + self.dataset.update_status( + "Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) notes_retries += 1 time.sleep(10) continue @@ -847,7 +863,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): count += 1 post_notes.append(note) - + if count >= max_reblogs: post_notes = post_notes[:count + note_metrics.get("reply_count", 0)] stop_collecting = True @@ -857,7 +873,7 @@ def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000): if notes.get("_links"): max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - + self.dataset.update_status("Collected %s %s for @%s:%s" % (count, note_type, blog_id, post_id)) time.sleep(.2) @@ -894,9 +910,9 @@ def connect_to_tumblr(self): """ # User input keys config_keys = [self.parameters.get("consumer_key"), - self.parameters.get("consumer_secret"), - self.parameters.get("key"), - self.parameters.get("secret_key")] + self.parameters.get("consumer_secret"), + self.parameters.get("key"), + self.parameters.get("secret_key")] if not all(config_keys): # No user input keys; attempt to use 4CAT config keys config_keys = self.get_tumblr_keys(self.owner) @@ -927,12 +943,13 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ + # no query 4 u if not query.get("query", "").strip(): raise QueryParametersException("You must provide a search query.") # reformat queries to be a comma-separated list - items = query.get("query").replace("#","") + items = query.get("query").replace("#", "") items = items.split("\n") # Not more than 10 plox @@ -964,12 +981,11 @@ def map_item(post): Parse Tumblr posts. Tumblr posts can be many different types, so some data processing is necessary. - :param posts, list: List of Tumblr posts as returned form the Tumblr API. + :param post: Tumblr post, as returned by the Tumblr API. - :return dict: Mapped item + :return dict: Mapped item """ - media_types = ["photo", "video", "audio"] image_urls = [] image_urls_reblogged = [] video_urls = [] @@ -983,19 +999,14 @@ def map_item(post): answers = "" raw_text = [] formatted_text = [] - authors_reblogged = [] - reblog_trail = [] body_reblogged = [] reblog_trail = [] body_ask = [] author_ask = "" authors_replied = [] - like_count = "" replies = [] unknown_blocks = [] - ordered_list_count = 1 - # Sometimes the content order is reshuffled in the `layout` property, # so we have to follow this. 
content_order = [] @@ -1018,7 +1029,7 @@ def map_item(post): # We're getting info as Neue Post Format types, # so we need to loop through and join some content 'blocks'. for i in content_order: - + block = post["content"][i] block_type = block["type"] @@ -1085,14 +1096,14 @@ def map_item(post): # This includes reblogged content, but it's not entirely complete (e.g. no tags) # so we'll only store the original blog name and its text + image content. for i, reblog in enumerate(post.get("trail", [])): - + reblogged_text = [] if "broken_blog_name" in reblog: reblog_author = reblog["broken_blog_name"] else: reblog_author = reblog["blog"]["name"] - + for reblog_block in reblog.get("content", []): if reblog_block["type"] == "text": reblogged_text.append(reblog_block["text"]) @@ -1102,7 +1113,7 @@ def map_item(post): if not reblogged_text: reblogged_text = "" body_reblogged.append("\n".join(reblogged_text)) - + reblog_trail.append(reblog_author) return MappedItem({ @@ -1112,7 +1123,7 @@ def map_item(post): "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", "thread_id": post["reblog_key"], "timestamp": datetime.fromtimestamp(post["timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), - "unix_timestamp": post["timestamp"], + "unix_timestamp": post["timestamp"], "author_subject": post["blog"]["title"], "author_description": strip_tags(post["blog"]["description"]), "author_url": post["blog"]["url"], @@ -1158,7 +1169,7 @@ def format_tumblr_text(text_content): Format text content according to Tumblr's Neue Post Format definition. Returns text as mardkown. - :param content, list: The list of content as returned by the Tumblr API (can also be part of a `trail`) + :param text_content: A list of `content` as returned by the Tumblr API (can also be part of a `trail`). 
:returns dict """ @@ -1180,7 +1191,7 @@ def format_tumblr_text(text_content): s = fmt["start"] e = fmt["end"] - opening = True # To know if styles need to be appended or prepended + opening = True # To know if styles need to be appended or prepended for n in [s, e]: insert_indexes.add(n) n = str(n) @@ -1203,9 +1214,10 @@ def format_tumblr_text(text_content): n = int(n) + extra_chars text = text[:n] + insert + text[n:] extra_chars += len(insert) - + # Some more 'subtype' formatting subtype = text_content.get("subtype") + ordered_list_count = 1 if subtype: if subtype == "unordered-list-item": text = "- " + text @@ -1238,4 +1250,5 @@ def after_process(self): errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_posts)) if errors: self.dataset.log(";\n ".join(errors)) - self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details") + self.dataset.update_status( + f"Dataset completed but failed to capture some notes/reblogs; see log for details") From 12b54b1416fb4999806e864dc8a8524f74900941 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:29 +0200 Subject: [PATCH 128/204] Add hash function to helpers --- common/lib/helpers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common/lib/helpers.py b/common/lib/helpers.py index 83e31fc4b..639d93df6 100644 --- a/common/lib/helpers.py +++ b/common/lib/helpers.py @@ -3,6 +3,7 @@ """ import subprocess import requests +import hashlib import datetime import smtplib import fnmatch @@ -897,3 +898,9 @@ def folder_size(path='.'): elif entry.is_dir(): total += folder_size(entry.path) return total + +def hash_values(string: str) -> str: + """ + Hash a string + """ + return hashlib.md5(string.encode("utf-8")).hexdigest() \ No newline at end of file From 2e6185c9f56a0dbddf2a7f1dededafea2f234a6c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:08:49 +0200 Subject: [PATCH 129/204] Revert test code in count posts processor --- processors/metrics/count_posts.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index af32ed565..3ae077644 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -51,17 +51,12 @@ def process(self): first_interval = "9999" last_interval = "0000" - annotations = [] - self.dataset.update_status("Processing items") with self.dataset.get_results_path().open("w") as results: counter = 0 for post in self.source_dataset.iterate_items(self): - annotation = {"value": "test", "item_id": post["id"]} - annotations.append(annotation) - try: date = get_interval_descriptor(post, timeframe) except ValueError as e: @@ -152,7 +147,6 @@ def process(self): row["value_relative"] = intervals[interval]["relative"] rows.append(row) - self.write_annotations(annotations) self.write_csv_items_and_finish(rows) @classmethod From 4ac3e62a2d848959d73fcdc580cbf374f416d3ca Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 13:09:17 +0200 Subject: [PATCH 130/204] Change parameter in Jinja2 template --- webtool/templates/components/result-result-row.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 06bd59290..331eecdf8 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -46,7 +46,7 @@ {% endif %} {% if 
__user_config("privileges.can_use_explorer") and has_explorer %}
  • - + Explore & annotate From 50cae616ab4ccde8d7b1810a92750fdee8a42301 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:21:23 +0200 Subject: [PATCH 131/204] Don't initialise Annotation() twice --- common/lib/annotation.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index aadee1205..c12c53eeb 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -5,8 +5,8 @@ import time import json -import hashlib +from common.lib.helpers import hash_values from common.lib.exceptions import AnnotationException class Annotation: @@ -23,7 +23,6 @@ class Annotation: data = None db = None - id = None # Unique ID for this annotation item_id = None # ID of the item for this annotation, e.g. post ID field_id = None # If of this type of annotation field for this dataset @@ -51,7 +50,7 @@ def __init__(self, data=None, id=None, db=None): required_fields = ["label", "item_id", "dataset"] # Must have an ID or data - if id is None and (data is None or not isinstance(data, dict)): + if (id is None and data is None) or (data is not None and not isinstance(data, dict)): raise AnnotationException("Annotation() requires either a valid `data` dictionary or ID.") if not db: @@ -69,7 +68,7 @@ def __init__(self, data=None, id=None, db=None): if data and "id" in data: id = data["id"] self.id = id # IDs correspond to unique serial numbers in the database. - current = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (self.id)) + current = self.get_by_id(id) if not current: raise AnnotationException( "Annotation() requires a valid ID for an existing annotation, %s given" % id) @@ -86,9 +85,6 @@ def __init__(self, data=None, id=None, db=None): # If we were able to retrieve an annotation from the db, it already exists if current: - - #current["metadata"] = json.loads(current["metadata"]) - # Check if we have to overwrite old data with new data if data: for key, value in data.items(): @@ -105,7 +101,6 @@ def __init__(self, data=None, id=None, db=None): # If this is a new annotation, set all the properties. else: - # Keep track of when the annotation was made created_timestamp = int(time.time()) @@ -142,7 +137,6 @@ def __init__(self, data=None, id=None, db=None): v = bool(v) except ValueError as e: raise AnnotationException("Annotation fields are not of the right type (%s)" % e) - self.__setattr__(k, v) # Write to db if anything changed @@ -150,13 +144,13 @@ def __init__(self, data=None, id=None, db=None): self.timestamp = int(time.time()) self.write_to_db() - def get_by_id(id: int, db): + def get_by_id(self, id: int): """ Get annotation by ID :param str id: ID of annotation :param db: Database connection object - :return: Annotation object, or `None` for invalid annotation ID + :return: Annotation object, or an empty dict if the ID doesn't exist. """ try: @@ -164,18 +158,24 @@ def get_by_id(id: int, db): except ValueError: raise AnnotationException("Id '%s' is not valid" % id) - return Annotation(id=id, db=db) + data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s" % (id)) + + if not data: + return {} + + data["metadata"] = json.loads(data["metadata"]) + return data def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict: """ - Get the annotation information via its dataset key, item ID, and label. + Get the annotation information via its dataset key, item ID, and field_id. This is always a unique combination. 
:param dataset_key: The key of the dataset this annotation was made for. :param item_id: The ID of the item this annotation was made for. :param label: The label of the annotation. - :return data: A dict with data of the retrieved annotation, or None if it doesn't exist. + :return data: A dict with data of the retrieved annotation, or an empty dict if it doesn't exist. """ data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s", @@ -195,8 +195,8 @@ def set_field_id(self, dataset_key: str, label: str) -> str: :param label: The label of the dataset. """ - field_id = source_dataset.key + annotation["label"] - field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() + base_field_id = dataset_key + label + field_id = hash_values(base_field_id) self.field_id = field_id return self.field_id @@ -205,6 +205,7 @@ def write_to_db(self): Write an annotation to the database. """ db_data = self.data + db_data["timestamp"] = int(time.time()) m = db_data["metadata"] # To avoid circular reference error db_data["metadata"] = json.dumps(m) return self.db.upsert("annotations", data=db_data, constraints=["label", "dataset", "item_id"]) @@ -262,8 +263,8 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic if old_fields == new_fields: return 0 - fields_to_delete = set() # Delete all annotations with this field ID - fields_to_update = {} # Update values of annotations with this field ID + fields_to_delete = set() # Delete all annotations with this field ID + fields_to_update = {} # Update values of annotations with this field ID # Loop through the old annotation fields for old_field_id, old_field in old_fields.items(): @@ -418,7 +419,6 @@ def __setattr__(self, attr, value): if attr == "metadata": value = json.dumps(value) - self.timestamp = int(time.time()) self.db.update("annotations", where={"id": self.id}, data={attr: value}) self.data[attr] = value From cdbe6ed1cd07b09929264b6579bccd994c09a7e5 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:21:57 +0200 Subject: [PATCH 132/204] Clean up and revert some JS --- webtool/static/js/explorer.js | 117 ++++++++++++++++------------------ 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 6648c8ced..10c69423b 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -22,15 +22,14 @@ const annotations = { init: function() { - let edit_field_box = $("#edit-annotation-fields"); let editor = $("#annotation-fields-editor"); let editor_controls = $("#annotation-fields-editor-controls"); var edits_made = false; // Add a new annotation field when clicking the plus icon $("#new-annotation-field").on("click", function(){ - let annotations_div = annotations.getAnnotationsDiv(); - $(annotations_div).insertBefore(edit_field_box);}); + annotations.addAnnotationField(); + }); // Show and hide the annotations editor let toggle_fields = $("#toggle-annotation-fields") @@ -89,7 +88,7 @@ const annotations = { // Make enter apply the option fields editor_controls.on("keypress", "input", function(e){ - if (e.which == 13) { + if (e.which === 13) { annotations.applyAnnotationFields(); } }); @@ -121,7 +120,7 @@ const annotations = { // Save unsaved annotations upon changing a page. 
$('.page > a').click(function(){ - if (!$("#save-annotations").hassClass('disabled')) { + if (!$("#save-annotations").hasClass('disabled')) { annotations.saveAnnotations(); } }) @@ -146,11 +145,11 @@ const annotations = { let options = $(el).parent().parent().next(); let option_fields = options.find(".option-field"); - if (type == "text" || type == "textarea") { + if (type === "text" || type === "textarea") { option_fields.remove(); } - else if (type == "dropdown" || type == "checkbox") { - if (option_fields.length == 0) { + else if (type === "dropdown" || type === "checkbox") { + if (option_fields.length === 0) { options.append(annotations.getInputField); } } @@ -218,9 +217,9 @@ const annotations = { Returns an object with the set annotation fields. */ - var annotation_fields = {}; - var warning = ""; - var labels_added = [] + let annotation_fields = {}; + let warning = ""; + let labels_added = [] annotations.warnEditor(""); @@ -239,12 +238,12 @@ const annotations = { // Get the ID of the field, so we // can later check if it already exists. - let field_id = parseInt(this.id.split("-")[1]); + let field_id = this.id.split("-")[1]; // Make sure the inputs have a label if (!label.length > 0) { label_field.addClass("invalid"); - warning = "Input names can't be empty"; + warning = "Field labels can't be empty"; } // Make sure the names can't be duplicates else if (labels_added.includes(label)) { @@ -254,7 +253,7 @@ const annotations = { // We can't add field labels that are also existing column names else if (original_columns.includes(label)) { - warning = "Fields labels cannot be an existing column name"; + warning = "Field label " + label + " is already present as a dataset item, please rename."; label_field.addClass("invalid"); } @@ -264,7 +263,7 @@ const annotations = { // Keep track of the labels we've added labels_added.push(label) - if (type == "text" || type == "textarea") { + if (type === "text" || type === "textarea") { annotation_fields[field_id] = {"type": type, "label": label}; } // Add options for dropdowns and checkboxes @@ -390,8 +389,8 @@ const annotations = { // Applies the annotation fields to each post on this page. // First we collect the annotation information from the editor - var annotation_fields = annotations.parseAnnotationFields(e); - var fields_to_add = {}; + let annotation_fields = annotations.parseAnnotationFields(e); + let fields_to_add = {}; // Show an error message if the annotation fields were not valid. if (typeof annotation_fields == "string") { @@ -426,7 +425,7 @@ const annotations = { } }); - // Add input fields to every posts in the explorer. + // Add input fields to every post in the explorer. // We take the annotations of the first post to check // what's the current state and add them to every post after. let text_fields = ["textarea", "text"]; @@ -447,7 +446,7 @@ const annotations = { // Edit the labels if they have changed. label_span = $(class_id + " > .annotation-label"); label = label_span.first().text(); - if (label != input_label) { + if (label !== input_label) { label_span.each(function(){ $(this).text(input_label); }); @@ -460,7 +459,7 @@ const annotations = { // If the change is between a textbox and textarea, // change the input type and carry over the text. 
- if (input_type != old_input_type) { + if (input_type !== old_input_type) { if (text_fields.includes(input_type) && text_fields.includes(old_input_type)) { @@ -473,11 +472,11 @@ const annotations = { } // Replace the HTML element, insert old values, and change the type class - if (input_type == "text" && old_input_type == "textarea") { + if (input_type === "text" && old_input_type === "textarea") { $(this).parent().removeClass("textarea").addClass("text"); $(this).replaceWith($("").val(add_val)); } - else if (input_type == "textarea" && old_input_type == "text") { + else if (input_type === "textarea" && old_input_type === "text") { $(this).parent().removeClass("text").addClass("textarea"); $(this).replaceWith($("")); } @@ -662,26 +661,25 @@ const annotations = { // Save the annotation fields used for this dataset // to the datasets table. - if (annotation_fields.length < 1 || annotation_fields == undefined) { - annotation_fields = annotation_fields.parseAnnotationFields; + if (annotation_fields.length < 1) { + return; } // If there's annotation fields, we can enable/disable the buttons annotations.fieldsExist(); - var dataset_key = $("#dataset-key").text(); - var json_annotations = JSON.stringify(annotation_fields); + let dataset_key = $("#dataset-key").text(); // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: json_annotations, + data: JSON.stringify(annotation_fields), success: function (response) { // If the query is accepted by the server. - if (response == 'success') { + if (response === 'success') { $("#annotations-editor-container").hide(); $("#apply-annotation-fields").addClass("disabled"); } @@ -821,43 +819,39 @@ const annotations = { pa.animate({"height": 0}, 250); }, - getAnnotationsDiv: function(id){ - // Returns an input field element with a pseudo-random ID, if none is provided. - if (id == undefined || id == 0) { - id = annotations.randomInt(); - } - - // Returns an annotation div element with a pseudo-random ID - return `
    -
    - - -
    -
    -
    -
    - -
    -
    -
    `.replace("{{FIELD_ID}}", id); + addAnnotationField: function(){ + /* + Adds an annotation field input element; + these have no IDs yet, we'll add a hashed database-label string when saving. + */ + + let annotation_field = `
    +
    + + +
    +
    +
    +
    + +
    +
    `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); + $(annotation_field).insertBefore($("#edit-annotation-fields")); }, getInputField: function(id){ - // Returns an input field element with a pseudo-random ID, if none is provided. - if (id == undefined || id == 0) { - id = annotations.randomInt(); + // Returns an option field element with a pseudo-random ID, if none is provided. + if (id === undefined || id === 0) { + id = Math.floor(Math.random() * 100000000).toString(); } return "
    "; }, - - randomInt: function(){ - return Math.floor(Math.random() * 100000000); - } }; const page_functions = { @@ -872,13 +866,14 @@ const page_functions = { // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ - + + // Get the column to sort on, an whether we should sort in reverse. let selected = $("#column-sort-select").find("option:selected").val(); let order = $("#column-sort-order").find("option:selected").val(); sort_order = "" - if (order == "reverse"){ + if (order === "reverse"){ sort_order = "&order=reverse" } From 851e067f0f1b25a464a00672b7fee09908d2d083 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:22:13 +0200 Subject: [PATCH 133/204] Separate annoatation field into a component --- .../templates/explorer/annotation-field.html | 40 ++++++++++++++++ .../explorer/annotations-editor.html | 47 ++----------------- 2 files changed, 45 insertions(+), 42 deletions(-) create mode 100644 webtool/templates/explorer/annotation-field.html diff --git a/webtool/templates/explorer/annotation-field.html b/webtool/templates/explorer/annotation-field.html new file mode 100644 index 000000000..1f944d366 --- /dev/null +++ b/webtool/templates/explorer/annotation-field.html @@ -0,0 +1,40 @@ +{% set annotation_type = annotation_field["type"] %} + +{% set label = annotation_field["label"] %} + +
    +
    + + +
    +
    +
    +
    + +
    +
    + +{% if annotation_type == "dropdown" or annotation_type == "checkbox" %} +
    +
    + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} +
    + + +
    + {% endfor %} +
    + +
    +
    +
    +{% else %} +
    +{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index cf356f542..bb75e6bff 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -27,49 +27,12 @@
  • {% if annotation_fields %} + {% for field in annotation_fields %} + {% set annotation_field = annotation_fields[field] %} + {% include "explorer/annotation-field.html" %} + {% endfor %} + {% endif %} - {% for field in annotation_fields %} - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} - -
    -
    - - -
    -
    -
    -
    - -
    -
    - - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
    -
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
    - - -
    - {% endfor %} -
    - -
    -
    -
    - {% else %} -
    - {% endif %} - {% endfor %} - {% endif %}
    New field From f0a97081d5aa70a2a70019b391a1b2bebe145ba7 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Mon, 19 Aug 2024 18:50:45 +0200 Subject: [PATCH 134/204] Make processor and Explorer annotation features co-exist peacefully --- backend/lib/processor.py | 19 +++--- common/lib/dataset.py | 86 +++++++++++++++------------ webtool/views/views_explorer.py | 101 +++++++++++++++++++++----------- 3 files changed, 123 insertions(+), 83 deletions(-) diff --git a/backend/lib/processor.py b/backend/lib/processor.py index 3ee7704d1..47dbcf3c9 100644 --- a/backend/lib/processor.py +++ b/backend/lib/processor.py @@ -3,7 +3,6 @@ """ import re import traceback -import hashlib import zipfile import typing import shutil @@ -11,13 +10,14 @@ import abc import csv import os +import random from pathlib import Path, PurePath from backend.lib.worker import BasicWorker from common.lib.dataset import DataSet from common.lib.fourcat_module import FourcatModule -from common.lib.helpers import get_software_commit, remove_nuls, send_email +from common.lib.helpers import get_software_commit, remove_nuls, send_email, hash_values from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException, DataSetException, MapItemException) from common.config_manager import config, ConfigWrapper @@ -739,10 +739,9 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not source_dataset: source_dataset = self.source_dataset - # Create a field ID based on the + already_exists_error = False # Check if this dataset already has annotation fields - field_id = "" existing_labels = source_dataset.get_annotation_field_labels() # Set some values @@ -756,6 +755,10 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa if not overwrite and label in existing_labels: label += "-" + str(len([l for l in existing_labels if l.startswith(label)])) annotation["label"] = label + elif annotation.get("label") and not overwrite: + if annotation["label"] in existing_labels: + already_exists_error = annotation["label"] + break # Set the author to this processor's name if not annotation.get("author"): @@ -768,11 +771,9 @@ def write_annotations(self, annotations: list, source_dataset=None, overwrite=Fa annotation["metadata"] = {} annotation["metadata"]["processor-parameters"] = self.parameters - if not annotation.get("field_id"): - if not field_id: - field_id = source_dataset.key + annotation["label"] - field_id = hashlib.md5(field_id.encode("utf-8")).hexdigest() - annotation["field_id"] = field_id + if already_exists_error: + self.dataset.finish_with_error( + "Annotation label '%s' already exists for this dataset" % already_exists_error) annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite) return annotations_saved diff --git a/common/lib/dataset.py b/common/lib/dataset.py index 56aeb5b5e..56f92601f 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -1,7 +1,6 @@ import collections import itertools import datetime -import hashlib import fnmatch import random import shutil @@ -14,7 +13,7 @@ from common.config_manager import config from common.lib.annotation import Annotation from common.lib.job import Job, JobNotFoundException -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, hash_values from common.lib.item_mapping import MappedItem, DatasetItem from 
common.lib.fourcat_module import FourcatModule
 from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException,
@@ -965,7 +964,7 @@ def get_key(self, query, parameters, parent="", time_offset=0):
 		parent_key = str(parent) if parent else ""
 		plain_key = repr(param_key) + str(query) + parent_key
-		hashed_key = hashlib.md5(plain_key.encode("utf-8")).hexdigest()
+		hashed_key = hash_values(plain_key)
 
 		if self.db.fetchone("SELECT key FROM datasets WHERE key = %s", (hashed_key,)):
 			# key exists, generate a new one
@@ -1584,17 +1583,17 @@ def get_annotations(self, item_id=[]) -> list:
 		Retrieves the annotations for this dataset.
 
 		:param item_id: A list of item IDs to get the annotations from.
-		If empty, get all the annotations for this dataset.
-		May also be a string to get annotations from a specific item.
+		May also be a string or int to get a specific annotation.
+		If left empty, get all the annotations for this dataset.
 
-		return list: All annotations, each in their own dictionary.
+		return list: List of Annotation objects.
 		"""
 
 		annotations = []
 
 		# Get annotation IDs first
 		if item_id:
-			# Get specific annotations if IDs are given
+			# Cast to string
 			if isinstance(item_id, str) or isinstance(item_id, int):
 				item_id = [item_id]
 			item_id = [str(i) for i in item_id]
@@ -1607,11 +1606,10 @@
 		if not ids:
 			return []
 
-		ids = [i["id"] for i in ids]
-
 		# Then get the annotations by ID
+		ids = [i["id"] for i in ids]
 		for id in ids:
-			annotations.append(Annotation.get_by_id(id, self.db))
+			annotations.append(Annotation(id=id, db=self.db))
 
 		return annotations
@@ -1678,53 +1676,57 @@ def save_annotations(self, annotations: list, overwrite=True) -> int:
 		count = 0
 		annotation_fields = self.get_annotation_fields()
 		annotation_labels = self.get_annotation_field_labels()
-		known_field_ids = {}  # Just so we don't have to hash every annotation without a field ID
+
+		field_id = ""
+		salt = str(random.randrange(0, 1000000))
 
 		# Add some dataset data to annotations, if not present
-		for annotation in annotations:
+		for annotation_data in annotations:
 
 			# Check if the required fields are present
-			if "item_id" not in annotation:
+			if "item_id" not in annotation_data:
 				raise AnnotationException("Can't save annotations; annotation must have an `item_id` referencing "
-										  "the item they annotated, got %s" % annotation)
+										  "the item it annotated, got %s" % annotation_data)
-			if "label" not in annotation or not isinstance(annotation["label"], str):
+			if "label" not in annotation_data or not isinstance(annotation_data["label"], str):
 				raise AnnotationException("Can't save annotations; annotation must have a `label` field, "
										  "got %s" % annotation_data)
-			if not overwrite and annotation["label"] in annotation_labels:
+			if not overwrite and annotation_data["label"] in annotation_labels:
 				raise AnnotationException("Can't save annotations; annotation field with label %s "
-										  "already exists" % annotation["label"])
+										  "already exists" % annotation_data["label"])
 
 			# Set dataset key
-			if not 
annotation_data.get("dataset"): + annotation_data["dataset"] = self.key # Set default author to this dataset owner # If this annotation is made by a processor, it will have the processor name - if not annotation.get("author"): - annotation["author"] = self.get_owners()[0] + if not annotation_data.get("author"): + annotation_data["author"] = self.get_owners()[0] + + # The field ID can already exists for the same dataset/key combo, + # if a previous label has been renamed. + # If we're not overwriting, create a new key with some salt. + if not overwrite: + if not field_id: + field_id = hash_values(annotation_data["dataset"] + annotation_data["label"] + salt) + if field_id in annotation_fields: + annotation_data["field_id"] = field_id + + # Create Annotation object, which also saves it to the database + # If this dataset/item ID/label combination already exists, this retrieves the + # existing data and updates it with new values. + annotation = Annotation(data=annotation_data, db=self.db) # Add data on the type of annotation field, if it is not saved to the datasets table yet. # For now this is just a simple dict with a field ID, type, label, and possible options. - if not annotation_fields or annotation["field_id"] not in annotation_fields: - annotation_fields[annotation["field_id"]] = { - "label": annotation["label"], - "type": annotation.get("type", "text") # Default to text + if not annotation_fields or annotation.field_id not in annotation_fields: + annotation_fields[annotation.field_id] = { + "label": annotation.label, + "type": annotation.type # Defaults to `text` } - if "options" in annotation: - annotation_fields[annotation["field_id"]]["options"] = annotation["options"] + if annotation.options: + annotation_fields[annotation.options] = annotation.options - # Create Annotation object, which also saves it to the database - Annotation(data=annotation, db=self.db) count += 1 # Save annotation fields if things changed @@ -1799,6 +1801,12 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int: if old_fields[field_id] != annotation_field: changes = True + # Check if fields are removed + if not add: + for field_id in old_fields.keys(): + if field_id not in new_fields: + changes = True + # If we're just adding fields, add them to the old fields. # If the field already exists, overwrite the old field. if add and old_fields: diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 88a2b64d8..abc9c5075 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -3,14 +3,16 @@ format and lets users annotate the data. """ +import json + from pathlib import Path -from flask import request, render_template +from flask import request, render_template, jsonify from flask_login import login_required, current_user from webtool import app, db, openapi, limiter, config from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet -from common.lib.helpers import convert_to_float +from common.lib.helpers import convert_to_float, hash_values from common.lib.exceptions import DataSetException from common.config_manager import ConfigWrapper @@ -90,7 +92,7 @@ def explorer_dataset(dataset_key: str, page=1): # We don't need to sort if we're showing the existing dataset order (default). # If we're sorting, we need to iterate over the entire dataset first. 
- if not sort or (sort == "dataset-order" and reverse == False): + if not sort or (sort == "dataset-order" and not reverse): for row in dataset.iterate_items(warn_unmappable=False): count += 1 @@ -139,32 +141,43 @@ def explorer_dataset(dataset_key: str, page=1): # Generate the HTML page return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning) -@app.route("/explorer/save_annotation_fields/", methods=["POST"]) +@app.route("/explorer/save_annotation_fields/", methods=["POST"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(key: str) -> int: +def explorer_save_annotation_fields(dataset_key: str) -> str: """ Save the annotation fields of a dataset to the datasets table. - :param key: The dataset key. + :param dataset_key: The dataset key. :return-error 404: If the dataset ID does not exist. :return int: The number of annotation fields saved. """ # Get dataset. - if not key: + if not dataset_key: return error(404, error="No dataset key provided") try: - dataset = DataSet(key=key, db=db) + dataset = DataSet(key=dataset_key, db=db) except DataSetException: return error(404, error="Dataset not found.") # Save it! annotation_fields = request.get_json() + + # Field IDs are not immediately set in the front end. + # We're going to do this based on the hash of the + # dataset key and the input label (should be unique) + field_keys = list(annotation_fields.keys()) + for field_id in field_keys: + if "undefined" in field_id: + new_field_id = hash_values(dataset_key + annotation_fields[field_id]["label"]) + annotation_fields[new_field_id] = annotation_fields[field_id] + del annotation_fields[field_id] + dataset.save_annotation_fields(annotation_fields) return "success" @@ -196,19 +209,32 @@ def explorer_save_annotations(dataset_key: str): dataset.save_annotations(annotations, overwrite=True) return "success" -@app.route("/explorer/save_annotation/", methods=["POST"]) + +@app.route("/explorer/get_annotation_field", methods=["GET"]) @api_ratelimit @login_required @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation(key="") -> int: +def get_annotation_field(): """ - todo: integrate + Returns an annotation field input div + + :return-error 406: If the list of subqueries could not be parsed. """ - return 0 + try: + annotation_field = json.loads(request.args.get("annotation_field")) + except (TypeError, json.decoder.JSONDecodeError): + return error(406, error="Unexpected format for annotation field.") + + html = render_template("explorer/annotation-field.html", annotation_field=annotation_field) + return jsonify({ + "status": "success", + "html": html} + ) + -def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) -> dict: +def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. 
This is basically a wrapper function for `iterate_items()` with the @@ -217,9 +243,9 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) This first iterates through the entire file (with a max limit) to determine an order. Then it yields items based on this order. - :param dataset, str: The dataset object. - :param sort_by, str: The item key that determines the sort order. - :param reverse, bool: Whether to sort by largest values first. + :param dataset: The dataset object. + :param sort: The item key that determines the sort order. + :param reverse: Whether to sort by largest values first. :returns dict: Yields iterated post """ @@ -228,7 +254,7 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) sorted_posts = [] # Use reversed() if we're reading the dataset from back to front. - if sort == "dataset-order" and reverse == True: + if sort == "dataset-order" and reverse: for item in reversed(list(dataset.iterate_items(**kwargs))): sorted_posts.append(item) @@ -247,12 +273,33 @@ def sort_and_iterate_items(dataset: DataSet, sort=None, reverse=False, **kwargs) for post in sorted_posts: yield post + +def has_datasource_template(datasource: str) -> bool: + """ + Check if the data source has a data source-specific template. + This requires HTML and CSS files. + Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. + Custom CSS files should be placed in `webtool/static/css/explorer/.css`. + + :param datasource: Datasource name. + + :returns: bool, Whether the required files are present. + """ + css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() + html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() + + if css_exists and html_exists: + return True + return False + def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): """ todo: Integrate later Retrieve posts by ID from a database-accessible data source. """ + raise NotImplementedError + if not ids: return None @@ -271,24 +318,6 @@ def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, of return posts -def has_datasource_template(datasource: str) -> bool: - """ - Check if the data source has a data source-specific template. - This requires HTML and CSS files. - Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/.html`. - Custom CSS files should be placed in `webtool/static/css/explorer/.css`. - - :param datasource: Datasource name. - - :returns: bool, Whether the required files are present. - """ - css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists() - html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists() - - if css_exists and html_exists: - return True - return False - @app.route('/results///explorer') @api_ratelimit @login_required @@ -306,6 +335,7 @@ def explorer_api_thread(datasource, thread_id): :return-error 404: If the thread ID does not exist for the given data source. """ + raise NotImplementedError if not datasource: return error(404, error="No datasource provided") @@ -346,6 +376,7 @@ def explorer_api_posts(datasource, post_ids): :return-error 404: If the thread ID does not exist for the given data source. 
""" + raise NotImplementedError if not datasource: return error(404, error="No datasource provided") From e78099f859bda42206289e0a2210210d6d3645ad Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Tue, 20 Aug 2024 11:31:57 +0200 Subject: [PATCH 135/204] Test annotation processor --- .../metrics/annotation_processor_test.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 processors/metrics/annotation_processor_test.py diff --git a/processors/metrics/annotation_processor_test.py b/processors/metrics/annotation_processor_test.py new file mode 100644 index 000000000..86f6f3264 --- /dev/null +++ b/processors/metrics/annotation_processor_test.py @@ -0,0 +1,45 @@ +""" +Collapse post bodies into one long string +""" + +from common.lib.helpers import UserInput +from backend.lib.processor import BasicProcessor + + +class AnnotatePosts(BasicProcessor): + """ + Merge post body into one long string + """ + type = "annotate-posts" # job type ID + category = "Metrics" # category + title = "Annotation test" # title displayed in UI + description = "Ya know" # description displayed in UI + extension = "csv" # extension of result file, used internally and in UI + + options = { + "overwrite": { + "type": UserInput.OPTION_TOGGLE, + "default": False, + "help": "Overwrite existing annotations by this processor?" + }, + "field_label": { + "type": UserInput.OPTION_TEXT, + "default": "" + } + } + + def process(self): + import random + annotations = [] + with self.dataset.get_results_path().open("w") as results: + + for post in self.source_dataset.iterate_items(self): + + annotation = {"item_id": post["id"], + "label": self.parameters.get("field_label", ""), + "value": random.randrange(1, 1000000)} + + annotations.append(annotation) + + self.write_annotations(annotations, overwrite=self.parameters.get("overwrite", False)) + self.dataset.finish(1) \ No newline at end of file From 90a0eb0ddf755c433802b27ab24664d78c298244 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 20 Aug 2024 12:55:10 +0200 Subject: [PATCH 136/204] Improve Tumblr search description --- datasources/tumblr/DESCRIPTION.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/datasources/tumblr/DESCRIPTION.md b/datasources/tumblr/DESCRIPTION.md index 8269204a1..5100cb47f 100644 --- a/datasources/tumblr/DESCRIPTION.md +++ b/datasources/tumblr/DESCRIPTION.md @@ -1,5 +1,5 @@ The Tumblr data is retrieved by interfacing with the [Tumblr API](https://api.tumblr.com). -It is only possible to get posts by tag or per blog, since the API does not allow keyword search. +It is only possible to get posts by tag, per blog, or by individual posts, since the API does not allow keyword search. ### Privacy Be aware that the data may contain personal information. It is thus recommended to pseudonymise the data. @@ -14,9 +14,8 @@ may request a rate limit increase via Tumblr. If no internal API key is set, you can insert your own. ### Date bugs -The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used -tags, it may return zero posts, even though older posts *do* exist. Check the oldest post in -your dataset to see if it this is indeed the case and whether any odd time gaps exists. +The [Tumblr API](https://api.tumblr.com) is volatile: when fetching content, it may return zero posts, even though older posts *do* exist. Check the oldest post in +your dataset to see if this is indeed the case and whether any odd time gaps exist. 
4CAT tries to mitigate this by decreasing the date parameter (before) by six hours
and sending the query again. This often successfully returns older, un-fetched posts.
If it didn't find new data after checking 24 days in the past, it checks for data up to six years

From 288dc1af0d8a7225bd610fc434b2a46c1273c8d8 Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Tue, 20 Aug 2024 12:55:42 +0200
Subject: [PATCH 137/204] Convert timestamps to the client's local time zone in Explorer

---
 webtool/static/js/explorer.js                    | 6 ++++++
 webtool/templates/explorer/post-annotations.html | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js
index 10c69423b..992465afe 100644
--- a/webtool/static/js/explorer.js
+++ b/webtool/static/js/explorer.js
@@ -864,6 +864,12 @@ const page_functions = {
 			document.querySelectorAll('.thread li').forEach(link => link.classList.remove('highlight'));
 		}));
 
+		// Change timestamps to the client's timezone
+		document.querySelectorAll(".timestamp-to-convert").forEach(function(el){
+			let local_date = new Date(parseInt(el.innerText) * 1000)
+			el.innerText = new Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date)
+		});
+
 		// Reorder the dataset when the sort type is changed
 		$(".sort-select").on("change", function(){
 
diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html
index f439a3dd3..6112397c7 100644
--- a/webtool/templates/explorer/post-annotations.html
+++ b/webtool/templates/explorer/post-annotations.html
@@ -55,7 +55,7 @@
    "; - - // Add a text input for text fields - if (input_type === "text") { - el += ""; - } - else if (input_type === "textarea") { - el += ""; - } - - // Add a dropdown for dropdown fields - else if (input_type === "dropdown") { - - el += ""; - } - - // Add checkboxes for checkbox fields - else if (input_type === "checkbox") { - - el += "
    "; - let options = fields_to_add[add_field].options; - - for (let i in options) { - - for (let option_id in options[i]) { - - option_label = options[i][option_id]; - - el += ""; - } - } - el += "
    "; - } - el += "
    "; - $(".posts li").each(function(){ - let post_id = this.id.split("post-")[1]; - $(this).find(".post-annotations").append(el.replaceAll("{POST_ID}", post_id)); - }); - } - } - - // Remove annotation forms that are deleted - var valid_fields = []; - for (var f in annotation_fields) { - valid_fields.push("field-" + f); - } - var present_annotations = $(".post-annotations").first().find(".post-annotation") - present_annotations.each(function(){ - let present_id = $(this).attr("class").split(" ")[1]; - if (!valid_fields.includes(present_id)) { - $("." + present_id).remove(); - } - }); - - // Hide annotations if there's no fields leftover - var leftover_annotations = $(".post-annotations").first().find(".post-annotation"); - if (leftover_annotations.length < 1) { - annotations.hideAnnotations(); - $("#toggle-annotations").addClass("disabled"); + annotations.saveAnnotationFields(annotation_fields); + location.reload(); } - // Else we're showing 'em - else { - annotations.showAnnotations(); - $("#toggle-annotations").removeClass("disabled"); - } - - $("#apply-annotation-fields").html(" Apply") }, saveAnnotationFields: function (annotation_fields){ @@ -695,7 +465,7 @@ const annotations = { }); }, - saveAnnotations: function (e){ + saveAnnotations: function (){ // Write the annotations to the dataset and annotations table. // First we're going to collect the data for this page. @@ -705,7 +475,6 @@ const annotations = { $(".posts > li").each(function(){ - let vals_changed = false; let post_annotations = $(this).find(".post-annotations"); if (post_annotations.length > 0) { @@ -713,7 +482,7 @@ const annotations = { post_annotations.find(".post-annotation").each(function(){ // Extract annotation object from the element - let annotation = annotations.parseAnnotation(this); + let annotation = annotations.parseAnnotation($(this)); if (annotation) { anns.push(annotation); @@ -721,11 +490,13 @@ const annotations = { }); } }) - - $("#save-annotations").html(" Saving annotations") + + let save_annotations = $("#save-annotations"); + save_annotations.html(" Saving annotations") annotations.disableSaving(); let code = "" + $.ajax({ url: getRelativeURL("explorer/save_annotations/" + dataset_key), type: "POST", @@ -738,24 +509,24 @@ const annotations = { code = response annotations.enableSaving(); - $("#save-annotations").html(" Annotations saved"); - $("#save-annotations").addClass("disabled"); - old_annotation_fields = $("#annotation-field").each(); + save_annotations.html(" Annotations saved"); + save_annotations.addClass("disabled"); + //var old_annotation_fields = $("#annotation-field").each(); // alert(alert_message); } else { annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); - $("#save-annotations").removeClass("disabled"); + save_annotations.html(" Save annotations"); + alert("Couldn't save annotations"); + save_annotations.removeClass("disabled"); console.log(response); } }, error: function (error) { annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - $("#save-annotations").removeClass("disabled"); - //alert("Could't save annotations"); + save_annotations.html(" Save annotations"); + save_annotations.removeClass("disabled"); + //alert("Couldn't save annotations"); console.log(error) } }); @@ -804,8 +575,8 @@ const annotations = { ta.html(" Hide annotations"); // Bit convoluted, but necessary to have auto height let pa = $(".post-annotations"); - current_height = pa.height(); - auto_height = 
pa.css("height", "auto").height(); + let current_height = pa.height(); + let auto_height = pa.css("height", "auto").height(); pa.height(current_height).animate({"height": auto_height}, 250, function(){ pa.height("auto"); }); @@ -852,28 +623,38 @@ const annotations = { } return "
    "; }, + + markChanges: function(el) { + // Adds current changes to a post annotation so we can save these later. + // Currently includes the time of edits and the username of the annotator + let current_username = $("#current-username").html(); + let current_date = Date.now() / 1000; + let input_field = el.find(".post-annotation-input"); + input_field.addClass("edited"); + $(el).find(".annotation-author").html(current_username); + $(el).find(".epoch-timestamp-edited").html(current_date); + $(el).find(".timestamp-edited").html(getLocalTimeStr(current_date)); + } }; const page_functions = { init: function() { - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function() { let post = 'post-' + this.getAttribute('href').split('-').pop(); document.querySelector('#' + post).classList.add('highlight'); })); - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function() { document.querySelectorAll('.thread li').forEach(link => link.classList.remove('highlight')); })); // Change timestamps to the client's timezone document.querySelectorAll(".timestamp-to-convert").forEach(function(el){ - let local_date = new Date(parseInt(el.innerText) * 1000) - el.innerText = new Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date) + el.innerText = getLocalTimeStr(el.innerText); }); // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ - // Get the column to sort on, an whether we should sort in reverse. let selected = $("#column-sort-select").find("option:selected").val(); let order = $("#column-sort-order").find("option:selected").val(); @@ -914,5 +695,10 @@ function getRelativeURL(endpoint) { return root + endpoint; } +function getLocalTimeStr(epoch_timestamp) { + let local_date = new Date(parseInt(epoch_timestamp) * 1000) + local_date = Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date); + return local_date +} }); \ No newline at end of file From 908544be2bdb83e6bfc3a2db5d8bed1e870b4ce0 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Tue, 20 Aug 2024 17:59:20 +0200 Subject: [PATCH 144/204] Style changes in Explorer --- .../templates/components/result-details.html | 4 ++-- webtool/templates/explorer/explorer.html | 3 +++ .../templates/explorer/post-annotations.html | 21 ++++++++++++++++--- webtool/views/views_explorer.py | 7 ++++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/webtool/templates/components/result-details.html b/webtool/templates/components/result-details.html index 6c3a6a8f2..485b0279d 100644 --- a/webtool/templates/components/result-details.html +++ b/webtool/templates/components/result-details.html @@ -117,10 +117,10 @@

    {% set annotations = dataset.get_annotations() %} {% if annotations %} - {{ annotations|length|numberify }} item{% if annotations|length > 1 %}s{% endif %} annotated with fields + {{ annotations|length|numberify }} annotation{% if annotations|length > 1 %}s{% endif %} {% endif %} {% for annotation_field in annotation_fields.items() %} - {{ annotation_field[1].type }} {{ annotation_field[1].label }} + {{ annotation_field[1].label }} {% endfor %}

    diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index eb31de870..7301190f4 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -10,6 +10,7 @@ + + {% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index 6112397c7..34dc8d429 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -51,23 +51,38 @@ {% endif %} {# Tooltip with metadata on the annotation #} - {% if annotation.author or annotation.timestamp or annotation.metadata %} + {% if annotation.author or annotation.author_original or annotation.timestamp or annotation.metadata %} {% endif %} {% endif %} + + {# Store some invisible data here to we can retrieve in with JS #} + +
    {% endfor %} {% endif %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index abc9c5075..ab48ffd73 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -19,13 +19,14 @@ config = ConfigWrapper(config, user=current_user, request=request) api_ratelimit = limiter.shared_limit("45 per minute", scope="api") -@app.route('/results//explorer/', defaults={'page': 1}) -@app.route('/results//explorer/page/') + +@app.route("/results//explorer/", defaults={"page": 1, "show_annotations": False}) +@app.route("/results//explorer/page/") @api_ratelimit @login_required @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_dataset(dataset_key: str, page=1): +def explorer_dataset(dataset_key: str, page=1, show_annotations=False): """ Show posts from a dataset From 5e77fe27e2accf6ec0fab4fc0356ae501b189398 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 21 Aug 2024 17:58:04 +0200 Subject: [PATCH 145/204] Redesign annotation field input controls, make them sortable, plus some other small fixes --- webtool/static/css/dataset-page.css | 44 +++++-- webtool/static/js/explorer.js | 121 +++++++++--------- webtool/static/js/fourcat.js | 1 + .../templates/explorer/annotation-field.html | 40 ------ .../templates/explorer/annotation-fields.html | 0 .../explorer/annotations-editor.html | 80 ++++++------ webtool/views/views_explorer.py | 25 ---- 7 files changed, 140 insertions(+), 171 deletions(-) delete mode 100644 webtool/templates/explorer/annotation-field.html create mode 100644 webtool/templates/explorer/annotation-fields.html diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index bbd96c7bf..308fdfa78 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -697,25 +697,41 @@ body.image-preview { } /* Explorer view */ -#annotation-fields-editor { - height: 0; - overflow: hidden; +#annotation-fields-editor-controls { + } -#annotation-fields-editor-controls { - display: grid; - grid-template-columns: auto auto auto; +#annotation-fields-editor-controls li { + position: relative; + background: white; +} + +#annotation-fields-editor-controls li:not(:last-child) { + border-bottom: 1px dotted var(--contrast-dark); +} + +.annotation-field > span, .annotation-field > i, .annotation-field > select { + vertical-align: top; } -#annotation-fields-editor-controls>div { - border-bottom: 1px solid var(--contrast-bright); +.annotation-field > i { + padding-top: 10px; +} + +.option-fields { + display: inline-block; + max-width: 250px; +} + +.option-field { + display: inline-block; } #edit-annotation-fields #input-warning { color: var(--accent-error); } -/* Remove all styles for explorer posts */ +/* Remove all styles for Explorer posts */ /* these ought to be defined specifically, */ /* and 4CAT styles shouldn't interfere. 
*/ #explorer-posts, #explorer-posts > ol li { @@ -743,6 +759,14 @@ body.image-preview { min-width: 140px; } -.annotation-field-label.invalid { +.annotation-field-label.invalid, .option-field > input.invalid { border: 1px solid red; +} + +#edit-annotation-fields { + padding: 0.5em +} + +.delete-input { + float: right; } \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 066266682..7df7ca9a2 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -74,11 +74,9 @@ const annotations = { // Delete an entire annotation input // We're in a grid of threes, so this involves three divs editor_controls.on("click", ".annotation-field > .delete-input", function(){ - let parent_div = $(this).parent().parent(); - parent_div.next().remove(); // Input type - parent_div.next().remove(); // Options - parent_div.remove(); // Label - }); + let parent_div = $(this).parent(); + parent_div.remove(); + }); // Make saving available when annotation fields are changed editor_controls.on("click", ".delete-option-field", function() { @@ -144,25 +142,20 @@ const annotations = { toggleField: function (el) { // Change the type of input fields when switching in the dropdown - let type = $(el).val(); - - let options = $(el).parent().parent().next(); - let option_fields = options.find(".option-field"); - + let options = $(el).parent().parent().find(".option-fields"); if (type === "text" || type === "textarea") { - option_fields.remove(); + options.remove(); } else if (type === "dropdown" || type === "checkbox") { - if (option_fields.length === 0) { + if (options.children().length === 0) { options.append(annotations.getInputField); } } }, addOptions: function (el){ - // Dynamically a new options for dropdowns and checkboxes - + // Dynamically a new options for dropdowns and checkboxes in the fields editor. // If text is added to a field, and there are // no empty fields available, add a new one. let no_empty_fields = true; @@ -232,25 +225,25 @@ const annotations = { // Parse information from the annotations editor. $(".annotation-field").each(function(){ - // To align the input form, we're in a grid of threes: - // label, input type, options. - // Navigate the DOM to get these elements: - let label_field = $(this).children(".annotation-field-label"); - let type_field = $(this).parent().next(); - let options_field = $(this).parent().next().next(); + let ann_field = $(this); + + let label_field = ann_field.find(".annotation-field-label"); + let type = ann_field.find(".annotation-field-type").val(); + let option_fields = ann_field.find(".option-fields"); let label = label_field.val().replace(/\s+/g, ' '); + let no_options_added = false // Get the ID of the field, so we // can later check if it already exists. 
- let field_id = this.id.split("-")[1]; + let field_id = ann_field.attr("id").split("-")[1]; // Make sure the inputs have a label if (!label.length > 0) { label_field.addClass("invalid"); warning = "Field labels can't be empty"; } - // Make sure the names can't be duplicates + // Make sure the labels can't be duplicates else if (labels_added.includes(label)) { warning = "Field labels must be unique"; label_field.addClass("invalid"); @@ -262,29 +255,27 @@ const annotations = { label_field.addClass("invalid"); } - // Set the types and values of the annotation - type = type_field.find(".annotation-field-type").val(); - // Keep track of the labels we've added - labels_added.push(label) - + labels_added.push(label); if (type === "text" || type === "textarea") { annotation_fields[field_id] = {"type": type, "label": label}; } // Add options for dropdowns and checkboxes - else { + else if (option_fields.length > 0) { let options = []; // List of dicts, because it needs to be ordered let option_labels = []; - let no_options_added = true; - options_field.find(".option-field > input").each(function(){ - let option_label = $(this).val(); - let option_id = $(this).id.replace("input-", ""); + no_options_added = true; + option_fields.find(".option-field").each(function(){ + let option_input = $(this).find("input"); + let option_label = option_input.val(); + let option_id = option_input.attr("id").replace("option-", ""); + // New option label if (!option_labels.includes(option_label) && option_label.length > 0) { // We're using a unique key for options as well. - option = {} + let option = {} option[option_id] = option_label options.push(option); option_labels.push(option_label); @@ -300,10 +291,9 @@ const annotations = { // But there must be at least one field in there. }); - if (no_options_added) { warning = "At least one field must be added"; - $(this).find(".option-field > input").first().addClass("invalid"); + ann_field.find(".option-fields .option-field input").first().addClass("invalid"); } if (Object.keys(options).length > 0) { @@ -322,13 +312,13 @@ const annotations = { parseAnnotation: function(el) { /* - Converts the DOM objects of an annotation field - to an annotation Object. + Converts the DOM objects of an annotation + to an annotation object. - Must be given a .post-annotation div element + Must be given a .post-annotation div element. */ - console.log(el) + let ann_input = el.find(".post-annotation-input"); let ann_classes = el.attr("class").split(" "); let ann_input_classes = ann_input.attr("class").split(" "); @@ -390,7 +380,6 @@ const annotations = { "by_processor": false, // Explorer annotations are human-made! "timestamp": timestamp } - console.log(annotation) return annotation }, @@ -399,7 +388,6 @@ const annotations = { // First we collect the annotation information from the editor let annotation_fields = annotations.parseAnnotationFields(e); - let fields_to_add = {}; // Show an error message if the annotation fields were not valid. if (typeof annotation_fields == "string") { @@ -596,24 +584,25 @@ const annotations = { these have no IDs yet, we'll add a hashed database-label string when saving. */ - let annotation_field = `
    -
    - - -
    -
    -
    -
    - -
    -
    `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); - $(annotation_field).insertBefore($("#edit-annotation-fields")); + let annotation_field = ` +
  • + + + + + + + + + +
  • + `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); + $("#annotation-field-settings").append(annotation_field); }, getInputField: function(id){ @@ -621,7 +610,7 @@ const annotations = { if (id === undefined || id === 0) { id = Math.floor(Math.random() * 100000000).toString(); } - return "
    "; + return ""; }, markChanges: function(el) { @@ -652,6 +641,18 @@ const page_functions = { el.innerText = getLocalTimeStr(el.innerText); }); + // Make annotation field editor sortable + $('#annotation-field-settings').sortable({ + cursor: "s-resize", + handle: ".handle", + items: "li", + axis: "y", + containment: "#annotation-field-settings", + change: function() { + $("#apply-annotation-fields").removeClass("disabled"); + } + }); + // Reorder the dataset when the sort type is changed $(".sort-select").on("change", function(){ diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index df56bca60..e36793c8b 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -1555,6 +1555,7 @@ const ui_helpers = { cursor: 'ns-resize', handle: '.handle', items: '.implicit, .explicit', + containment: '#tag-order', axis: 'y', update: function(e, ui) { let tag_order = Array.from(document.querySelectorAll('#tag-order li[data-tag]')).map(t => t.getAttribute('data-tag')).join(','); diff --git a/webtool/templates/explorer/annotation-field.html b/webtool/templates/explorer/annotation-field.html deleted file mode 100644 index 1f944d366..000000000 --- a/webtool/templates/explorer/annotation-field.html +++ /dev/null @@ -1,40 +0,0 @@ -{% set annotation_type = annotation_field["type"] %} - -{% set label = annotation_field["label"] %} - -
-{% if annotation_type == "dropdown" or annotation_type == "checkbox" %}
-	{% for option in annotation_fields[field]["options"] %}
-	{% set option_id = option.keys() | first %}
-	{% set option_label = option.values() | first %}
-	{% endfor %}
-{% else %}
    -{% endif %} \ No newline at end of file diff --git a/webtool/templates/explorer/annotation-fields.html b/webtool/templates/explorer/annotation-fields.html new file mode 100644 index 000000000..e69de29bb diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index bb75e6bff..68486cbba 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -3,41 +3,49 @@
-	Label
-	Input type
-	Options
      {% if annotation_fields %} - {% for field in annotation_fields %} - {% set annotation_field = annotation_fields[field] %} - {% include "explorer/annotation-field.html" %} - {% endfor %} - {% endif %} + {% for field in annotation_fields %} + {% set annotation_field = annotation_fields[field] %} + {% set annotation_type = annotation_field["type"] %} + {% set label = annotation_field["label"] %} +
-	New field
-	Apply
    \ No newline at end of file + + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} + {% for option in annotation_fields[field]["options"] %} + {% set option_id = option.keys() | first %} + {% set option_label = option.values() | first %} + + + + + {% endfor %} + + + + {% endif %} + + + + {% endfor %} + {% endif %} + + +
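For reference, the editor template above serializes its fields into a single JSON object keyed by field ID, which the views below consume. A minimal sketch of the assumed shape, and of how placeholder IDs are hashed once they reach the server, follows; hash_values is approximated with SHA-256 here, and the dataset key and labels are invented for the example.

import hashlib
import json

def hash_values(value):
    # Stand-in for common.lib.helpers.hash_values; the real helper may differ
    return hashlib.sha256(str(value).encode("utf-8")).hexdigest()

dataset_key = "abcd1234"  # hypothetical dataset key

# New fields arrive from the editor with a placeholder ID
# ("tohash-<randomint>" in later patches of this series)
annotation_fields = {
    "tohash-98765": {
        "type": "dropdown",
        "label": "Stance",
        "options": {"option-1": "agree", "option-2": "disagree"},
    }
}

# The server swaps each placeholder for a stable hash of dataset key + label
for field_id in list(annotation_fields):
    if "tohash" in field_id:
        new_id = hash_values(dataset_key + annotation_fields[field_id]["label"])
        annotation_fields[new_id] = annotation_fields.pop(field_id)

print(json.dumps(annotation_fields, indent=2))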
    diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index ab48ffd73..586d3a7e1 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -210,31 +210,6 @@ def explorer_save_annotations(dataset_key: str): dataset.save_annotations(annotations, overwrite=True) return "success" - -@app.route("/explorer/get_annotation_field", methods=["GET"]) -@api_ratelimit -@login_required -@setting_required("privileges.can_run_processors") -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def get_annotation_field(): - """ - Returns an annotation field input div - - :return-error 406: If the list of subqueries could not be parsed. - """ - try: - annotation_field = json.loads(request.args.get("annotation_field")) - except (TypeError, json.decoder.JSONDecodeError): - return error(406, error="Unexpected format for annotation field.") - - html = render_template("explorer/annotation-field.html", annotation_field=annotation_field) - return jsonify({ - "status": "success", - "html": html} - ) - - def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ Loop through both csv and NDJSON files. From af71c6c2f54892651c1cfb514051256a224f1fae Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 17:58:21 +0200 Subject: [PATCH 146/204] Fix and simplify annotation field saving, re-enable saving options (and keep 'em ordered) --- common/lib/annotation.py | 6 +- common/lib/dataset.py | 10 +- webtool/static/css/dataset-page.css | 8 +- webtool/static/js/explorer.js | 112 +++++++----------- .../templates/explorer/annotation-fields.html | 0 .../explorer/annotations-editor.html | 8 +- webtool/templates/explorer/controls.html | 4 +- .../templates/explorer/post-annotations.html | 8 +- webtool/views/views_explorer.py | 15 ++- 9 files changed, 77 insertions(+), 94 deletions(-) delete mode 100644 webtool/templates/explorer/annotation-fields.html diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 89aa95e56..153f1c6ba 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -311,6 +311,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic # Options are saved in a dict with IDs as keys and labels as values. 
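# Illustrative aside, not part of the patch: with options stored as {id: label}
# dicts, the rename detection below reduces to comparing two dicts. All values
# here are hypothetical.
example_old = {"o1": "agree", "o2": "disagree", "o3": "unsure"}
example_new = {"o1": "agree", "o2": "strongly disagree"}
example_renames = {
    old_label: example_new[option_id]
    for option_id, old_label in example_old.items()
    if option_id in example_new and old_label != example_new[option_id]
}
# example_renames == {"disagree": "strongly disagree"}; "o3" counts as deleted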
			for old_option_id, old_option in old_options.items():
+				# Renamed option label
				if old_option_id in new_options and old_option != new_options[old_option_id]:
					options_to_update[old_option] = new_options[old_option_id]  # Old label -> new label
@@ -340,6 +341,9 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic
			# Write to db
			for column, update_value in updates.items():
+				if column == "options":
+					update_value = json.dumps(update_value)
+
				# Change values of columns
				updates = db.update("annotations", {column: update_value},
									where={"dataset": dataset_key, "field_id": field_id})
@@ -350,7 +354,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic
				if column == "options":
					inserted_options = db.fetchall("SELECT id, value FROM annotations "
-												   "WHERE dataset = %s and field_id = %s" % (dataset_key, field_id))
+												   "WHERE dataset = %s AND field_id = %s", (dataset_key, field_id))
					new_inserts = []
					for inserted_option in inserted_options:
diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index 5dc1fa843..1780a1ced 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -1776,7 +1776,7 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int:
		# Get existing annotation fields to see if anything changed.
		old_fields = self.get_annotation_fields()
		changes = False
-
+
		# Do some validation
		# Annotation fields must be valid JSON.
		try:
@@ -1784,7 +1784,12 @@ def save_annotation_fields(self, new_fields: dict, add=False) -> int:
		except ValueError:
			raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields)

+		# No duplicate IDs
+		if len(new_fields) != len(set(new_fields)):
+			raise AnnotationException("Can't save annotation fields: field IDs must be unique")
+
		# Annotation fields must at minimum have `type` and `label` keys.
+		seen_labels = []
		for field_id, annotation_field in new_fields.items():
			if not isinstance(field_id, str):
				raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id)
			if "label" not in annotation_field:
				raise AnnotationException("Can't save annotation fields: field %s must have a label" % field_id)
			if "type" not in annotation_field:
				raise AnnotationException("Can't save annotation fields: field %s must have a type" % field_id)
+			if annotation_field["label"] in seen_labels:
+				raise AnnotationException("Can't save annotation fields: labels must be unique (%s)" % annotation_field["label"])
+			seen_labels.append(annotation_field["label"])

		# Keep track of whether existing fields have changed; if so, we're going to
		# update the annotations table.
diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css
index 308fdfa78..61916e156 100644
--- a/webtool/static/css/dataset-page.css
+++ b/webtool/static/css/dataset-page.css
@@ -696,9 +696,13 @@ body.image-preview {
	cursor: zoom-out;
}

-/* Explorer view */
-#annotation-fields-editor-controls {
+/* EXPLORER VIEW */
+#annotation-fields-editor {
+	height: 0;
+	overflow-y: hidden;
+}

+#annotation-fields-editor-controls {
}

#annotation-fields-editor-controls li {
diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js
index 7df7ca9a2..41dbe26a6 100644
--- a/webtool/static/js/explorer.js
+++ b/webtool/static/js/explorer.js
@@ -53,21 +53,21 @@
		// Keep track of when the annotation fields were edited.
editor_controls.on("click", "#apply-annotation-fields, .delete-input, .delete-option-field", function() { - $("#apply-annotation-fields").removeClass("disabled"); + edits_made = true; + annotations.enableSaving(); }); editor_controls.on("change keydown", "input, select", function() { - $("#apply-annotation-fields").removeClass("disabled"); + edits_made = true; + annotations.enableSaving(); }); // Show and hide annotations $("#toggle-annotations").on("click", function(){ - if (!$(this).hasClass("disabled")) { - if ($(this).hasClass("shown")) { - annotations.hideAnnotations(); - } - else { - annotations.showAnnotations(); - } + if ($(this).hasClass("shown")) { + annotations.hideAnnotations(); + } + else { + annotations.showAnnotations(); } }); @@ -103,8 +103,8 @@ const annotations = { // Make saving available when annotations are changed let post_annotations = $(".post-annotations"); - post_annotations.on("keydown", "input, textarea", function() { annotations.enableSaving(); edits_made = true;}); - post_annotations.on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); edits_made = true;}); + post_annotations.on("keydown", "input, textarea", function() { edits_made = true;}); + post_annotations.on("click", "option, input[type=checkbox], label", function() { edits_made = true;}); // Keep track of whether the annotations are edited or not. post_annotations.on("keydown change", @@ -116,25 +116,21 @@ const annotations = { // Save the annotations to the database $("#save-annotations").on("click", function(){ - if (!$(this).hasClass("disabled")) { - annotations.saveAnnotations(); - } + annotations.saveAnnotations(); }); // Save unsaved annotations upon changing a page. $('.page > a').click(function(){ - if (!$("#save-annotations").hasClass('disabled')) { - annotations.saveAnnotations(); - } + annotations.saveAnnotations(); }) // Check whether there's already fields saved for this dataset annotations.fieldsExist(); - // Save annotations every 10 seconds + // Save annotations every 10 seconds if changes have been made setInterval(function() { - if (!$("#save-annotations").hasClass("disabled") && edits_made) { - annotations.saveAnnotations(); + if (edits_made) { + //annotations.saveAnnotations(); } }, 10000); @@ -262,7 +258,7 @@ const annotations = { } // Add options for dropdowns and checkboxes else if (option_fields.length > 0) { - let options = []; // List of dicts, because it needs to be ordered + let options = new Map(); // Map, because it needs to be ordered let option_labels = []; no_options_added = true; @@ -271,13 +267,11 @@ const annotations = { let option_input = $(this).find("input"); let option_label = option_input.val(); let option_id = option_input.attr("id").replace("option-", ""); + // New option label if (!option_labels.includes(option_label) && option_label.length > 0) { - // We're using a unique key for options as well. 
- let option = {} - option[option_id] = option_label - options.push(option); + options.set(option_id, option_label); option_labels.push(option_label); no_options_added = false; } @@ -296,10 +290,10 @@ const annotations = { ann_field.find(".option-fields .option-field input").first().addClass("invalid"); } - if (Object.keys(options).length > 0) { + if (options.size > 0) { // Strip whitespace from the input field key label = label.replace(/\s+/g, ' '); - annotation_fields[field_id] = {"type": type, "label": label, "options": options}; + annotation_fields[field_id] = {"type": type, "label": label, "options": Object.fromEntries(options)}; } } }); @@ -411,7 +405,6 @@ const annotations = { // We store the annotation fields in the dataset table. annotations.saveAnnotationFields(annotation_fields); - location.reload(); } }, @@ -427,28 +420,25 @@ const annotations = { annotations.fieldsExist(); let dataset_key = $("#dataset-key").text(); - // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: JSON.stringify(annotation_fields), - + data: JSON.stringify(annotation_fields), success: function (response) { - // If the query is accepted by the server. - if (response === 'success') { - $("#annotations-editor-container").hide(); - $("#apply-annotation-fields").addClass("disabled"); - } + // If the query is accepted by the server... - // If the query is rejected by the server. - else { - annotations.warnEditor("Couldn't save annotation fields"); - } + //location.reload(); // ...simply reload the page to render the template again }, error: function (error) { - annotations.warnEditor(error); + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); + } + else { + annotations.warnEditor("Server error, couldn't save annotation fields.") + } + $("#apply-annotation-fields").html(" Apply"); } }); }, @@ -481,7 +471,6 @@ const annotations = { let save_annotations = $("#save-annotations"); save_annotations.html(" Saving annotations") - annotations.disableSaving(); let code = "" @@ -494,27 +483,16 @@ const annotations = { success: function (response) { if (response === 'success') { - code = response - - annotations.enableSaving(); - save_annotations.html(" Annotations saved"); - save_annotations.addClass("disabled"); - //var old_annotation_fields = $("#annotation-field").each(); - // alert(alert_message); + code = response; } else { - annotations.enableSaving(); - save_annotations.html(" Save annotations"); alert("Couldn't save annotations"); - save_annotations.removeClass("disabled"); console.log(response); } + save_annotations.html(" Save annotations"); }, error: function (error) { - annotations.enableSaving(); save_annotations.html(" Save annotations"); - save_annotations.removeClass("disabled"); - //alert("Couldn't save annotations"); console.log(error) } }); @@ -524,21 +502,12 @@ const annotations = { // Annotation fields are sent by the server // and saved in a script in the header. // So we just need to check whether they're there. 
- - if (Object.keys(annotation_fields).length < 1) { - $("#toggle-annotations").addClass("disabled"); - return false; - } - else { - $("#toggle-annotations").removeClass("disabled"); - return true; - } + return Object.keys(annotation_fields).length >= 1; }, enableSaving: function(){ // Enable saving annotations to the database $("#save-annotations, #save-to-dataset").removeClass("disabled"); - $("#save-annotations").html(" Save annotations"); }, disableSaving: function(){ @@ -547,7 +516,7 @@ const annotations = { }, warnEditor: function(warning) { - + // Warns the annotation field editor if stuff's wrong let warn_field = $("#input-warning"); warn_field.html(warning); if (warn_field.hasClass("hidden")) { @@ -559,7 +528,6 @@ const annotations = { showAnnotations: function() { let ta = $("#toggle-annotations"); ta.addClass("shown"); - ta.removeClass("disabled"); ta.html(" Hide annotations"); // Bit convoluted, but necessary to have auto height let pa = $(".post-annotations"); @@ -585,7 +553,7 @@ const annotations = { */ let annotation_field = ` -
  • @@ -614,8 +582,8 @@ const annotations = { }, markChanges: function(el) { - // Adds current changes to a post annotation so we can save these later. - // Currently includes the time of edits and the username of the annotator + // Adds info on edits on post annotation to its element, so we can save these to the db later. + // Currently includes the time of edits and the username of the annotator. let current_username = $("#current-username").html(); let current_date = Date.now() / 1000; let input_field = el.find(".post-annotation-input"); @@ -641,7 +609,7 @@ const page_functions = { el.innerText = getLocalTimeStr(el.innerText); }); - // Make annotation field editor sortable + // Make annotation field editor sortable with jQuery UI. $('#annotation-field-settings').sortable({ cursor: "s-resize", handle: ".handle", @@ -649,7 +617,7 @@ const page_functions = { axis: "y", containment: "#annotation-field-settings", change: function() { - $("#apply-annotation-fields").removeClass("disabled"); + } }); diff --git a/webtool/templates/explorer/annotation-fields.html b/webtool/templates/explorer/annotation-fields.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html index 68486cbba..b03d62be8 100644 --- a/webtool/templates/explorer/annotations-editor.html +++ b/webtool/templates/explorer/annotations-editor.html @@ -25,11 +25,9 @@ {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} + {% for option_id, option_label in annotation_fields[field]["options"].items() %} - + {% endfor %} @@ -45,7 +43,7 @@
diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html
index 44d816b7f..98c8f01f1 100644
--- a/webtool/templates/explorer/controls.html
+++ b/webtool/templates/explorer/controls.html
@@ -23,8 +23,8 @@

	Edit fields
-	Show annotations
-	{% if not annotations %}No annotations{% else %}Annotations saved{% endif %}
+	Show annotations
+	Save annotations
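The Save annotations control above posts every edited annotation to the server as one JSON list. The sketch below shows the rough shape of a single object in that payload, mirroring the keys that parseAnnotation collects in explorer.js; the values and the structural check are illustrative only, not 4CAT code.

def looks_like_annotation(annotation):
    # Minimal structural check; 4CAT's own validation is more involved
    required = {"field_id", "item_id", "label", "type", "value"}
    return required.issubset(annotation)

payload = [{
    "field_id": "1a2b3c",       # hashed field ID
    "item_id": "post-123",      # hypothetical ID of the annotated item
    "label": "Stance",
    "type": "dropdown",
    "value": "agree",
    "author": "some-user",      # hypothetical username
    "by_processor": False,      # Explorer annotations are human-made
    "timestamp": 1724342400,
}]

assert all(looks_like_annotation(a) for a in payload)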
    - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} + {% for option_id, option_label in annotation_fields[field]["options"].items() %} {% set checked = "checked" if option_label in annotation.value else "" %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 586d3a7e1..7a6323a4a 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -13,7 +13,7 @@ from webtool.lib.helpers import error, setting_required from common.lib.dataset import DataSet from common.lib.helpers import convert_to_float, hash_values -from common.lib.exceptions import DataSetException +from common.lib.exceptions import DataSetException, AnnotationException from common.config_manager import ConfigWrapper config = ConfigWrapper(config, user=current_user, request=request) @@ -148,7 +148,7 @@ def explorer_dataset(dataset_key: str, page=1, show_annotations=False): @setting_required("privileges.can_run_processors") @setting_required("privileges.can_use_explorer") @openapi.endpoint("explorer") -def explorer_save_annotation_fields(dataset_key: str) -> str: +def explorer_save_annotation_fields(dataset_key: str): """ Save the annotation fields of a dataset to the datasets table. @@ -174,14 +174,19 @@ def explorer_save_annotation_fields(dataset_key: str) -> str: # dataset key and the input label (should be unique) field_keys = list(annotation_fields.keys()) for field_id in field_keys: - if "undefined" in field_id: + if "tohash" in field_id: new_field_id = hash_values(dataset_key + annotation_fields[field_id]["label"]) annotation_fields[new_field_id] = annotation_fields[field_id] del annotation_fields[field_id] - dataset.save_annotation_fields(annotation_fields) + try: + fields_saved = dataset.save_annotation_fields(annotation_fields) + except AnnotationException as e: + # If anything went wrong with the annotation field saving, return an error. + return jsonify(error=str(e)), 400 - return "success" + # Else return the amount of fields saved. + return str(fields_saved) @app.route("/explorer/save_annotations/", methods=["POST"]) @api_ratelimit From a417283d399b173a56d2bb58821401c1a564484c Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 20:52:03 +0200 Subject: [PATCH 147/204] Forgot a postgresql field in migrate script --- helper-scripts/migrate/migrate-1.45-1.46.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/helper-scripts/migrate/migrate-1.45-1.46.py b/helper-scripts/migrate/migrate-1.45-1.46.py index 719507098..2894c1cb6 100644 --- a/helper-scripts/migrate/migrate-1.45-1.46.py +++ b/helper-scripts/migrate/migrate-1.45-1.46.py @@ -105,7 +105,7 @@ count = 0 skipped_count = 0 - columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,by_processor,metadata" + columns = "id,dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,author_original,by_processor,metadata" # Each row are **all** annotations per dataset for row in annotations: @@ -167,6 +167,7 @@ json.dumps(options) if options else "", # options; each option has a key and a value. 
value, # value author, # author + author, # author_original False, # by_processor json.dumps({}), # metadata )] From 09f26dc1f91f363482f9f60fdefa5198d1ea8fc8 Mon Sep 17 00:00:00 2001 From: Sal Hagen Date: Thu, 22 Aug 2024 20:52:42 +0200 Subject: [PATCH 148/204] Revamp annotation saving from annotations made in Explorer --- common/lib/annotation.py | 2 +- webtool/static/js/explorer.js | 130 ++++++++---------- .../templates/explorer/post-annotations.html | 22 +-- webtool/views/views_explorer.py | 10 +- 4 files changed, 74 insertions(+), 90 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 153f1c6ba..147378c57 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -116,7 +116,7 @@ def __init__(self, data=None, id=None, db=None): "options": data.get("options", ""), "value": data.get("value", ""), "author": data.get("author", ""), - "author_original": data.get("author_original", ""), + "author_original": data.get("author", ""), "by_processor": data.get("by_processor", False), "metadata": data.get("metadata", {}), } diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 41dbe26a6..2957d5366 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -100,18 +100,19 @@ const annotations = { annotations.addOptions(e.target); } }); - - // Make saving available when annotations are changed - let post_annotations = $(".post-annotations"); - post_annotations.on("keydown", "input, textarea", function() { edits_made = true;}); - post_annotations.on("click", "option, input[type=checkbox], label", function() { edits_made = true;}); // Keep track of whether the annotations are edited or not. - post_annotations.on("keydown change", - ".post-annotation-input, .post-annotation input, .post-annotation textarea", + let post_annotations = $(".post-annotations"); + post_annotations.on("keydown click change", + ".post-annotation-input, input[type=checkbox], label, option", function(){ - annotations.markChanges($(this).parent()); - + edits_made = true; + // Navigate one level up if it's a checkbox input + let parent = $(this).parent(); + if (parent.hasClass("checkboxes")) { + parent = parent.parent(); + } + annotations.markChanges(parent); }); // Save the annotations to the database @@ -310,71 +311,55 @@ const annotations = { to an annotation object. Must be given a .post-annotation div element. - */ let ann_input = el.find(".post-annotation-input"); let ann_classes = el.attr("class").split(" "); - let ann_input_classes = ann_input.attr("class").split(" "); - let field_id = ann_input_classes[1].replace("field-", ""); - let annotation_type = ann_classes[2].replace("type-", ""); + let ann_type = ann_classes[2].replace("type-", ""); + let field_id = ann_classes[1].replace("field-", ""); let item_id = ann_classes[3].replace("item-id-", ""); let label = el.find(".annotation-label").text(); let author = el.find(".annotation-author").html(); + let options = el.find(".annotation-options").html(); let timestamp = parseInt(el.find(".epoch-timestamp-edited").html()); let val = undefined; - let edited = false - - if (annotation_type === "text" || annotation_type === "textarea") { - val = ann_input.val(); - // It can be the case that the input text is deleted - // In this case we *do* want to push new data, so we check - // whether there's an 'edited' class present and save if so. - if (ann_input.hasClass("edited")) { - edited = true + + // If there are values inserted or things changed, return an annotation object. 
+ // even if the value is an empty string. + if (el.hasClass("edited")) { + if (ann_type === "text" || ann_type === "textarea") { + val = ann_input.val(); + } else if (ann_type === "dropdown") { + val = ann_input.find(".post-annotation-options").val(); + } else if (ann_type === "checkbox") { + val = []; + el.find(".post-annotation-input").each(function () { + let checkbox = $(this); + if (checkbox.prop("checked") === true) { + val.push(checkbox.val()); + } + }); } - } - else if (annotation_type === "dropdown") { - val = ann_input.find(".post-annotation-options").val(); - } - else if (annotation_type === "checkbox") { - val = []; - ann_input.find(".post-annotation-options > input").each(function(){ - if (ann_input.is(":checked")) { - val.push(ann_input.val()); - } - if (ann_input.hasClass("edited")) { - edited = true - } - }); - if (!val.length > 0) { - val = undefined; + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": field_id, + "item_id": item_id, + "label": label, + "type": ann_type, + "value": val, + "author": author, + "by_processor": false, // Explorer annotations are human-made! + "timestamp": timestamp, + "options": options, } + return annotation; } - - // if ((val !== undefined && val !== "") || edited) { - // vals_changed = true; - // val = ""; - // console.log("EDITED") - // } - // - // if (vals_changed){ - // annotation[post_id] = post_vals; - // } - - // Create an annotation object and add them to the array. - let annotation = { - "field_id": field_id, - "item_id": item_id, - "label": label, - "type": annotation_type, - "value": val, - "author": author, - "by_processor": false, // Explorer annotations are human-made! - "timestamp": timestamp + else { + // Return an empty object if nothing changed + return {}; } - return annotation }, applyAnnotationFields: function (e){ @@ -428,8 +413,7 @@ const annotations = { data: JSON.stringify(annotation_fields), success: function (response) { // If the query is accepted by the server... - - //location.reload(); // ...simply reload the page to render the template again + location.reload(); // ...simply reload the page to render the template again }, error: function (error) { if (error.status == 400) { @@ -461,8 +445,7 @@ const annotations = { // Extract annotation object from the element let annotation = annotations.parseAnnotation($(this)); - - if (annotation) { + if (Object.keys(annotation).length > 0 ) { anns.push(annotation); } }); @@ -472,8 +455,6 @@ const annotations = { let save_annotations = $("#save-annotations"); save_annotations.html(" Saving annotations") - let code = "" - $.ajax({ url: getRelativeURL("explorer/save_annotations/" + dataset_key), type: "POST", @@ -481,17 +462,15 @@ const annotations = { data: JSON.stringify(anns), success: function (response) { - - if (response === 'success') { - code = response; - } - else { - alert("Couldn't save annotations"); - console.log(response); - } save_annotations.html(" Save annotations"); }, error: function (error) { + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); + } + else { + annotations.warnEditor("Server error, couldn't save annotation fields.") + } save_annotations.html(" Save annotations"); console.log(error) } @@ -586,8 +565,7 @@ const annotations = { // Currently includes the time of edits and the username of the annotator. 
let current_username = $("#current-username").html(); let current_date = Date.now() / 1000; - let input_field = el.find(".post-annotation-input"); - input_field.addClass("edited"); + $(el).addClass("edited"); $(el).find(".annotation-author").html(current_username); $(el).find(".epoch-timestamp-edited").html(current_date); $(el).find(".timestamp-edited").html(getLocalTimeStr(current_date)); diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html index 842ac3a4a..75cc32c7b 100644 --- a/webtool/templates/explorer/post-annotations.html +++ b/webtool/templates/explorer/post-annotations.html @@ -22,13 +22,13 @@
	{% if type == 'text' %}
-
+
	{% elif type == 'textarea' %}
-
+
	{% elif type == 'dropdown' %}
-
+
		{% for option_id, option_label in annotation_fields[field]["options"].items() %}
@@ -37,10 +37,9 @@
	{% elif type == 'checkbox' %}
-
+
		{% for option_id, option_label in annotation_fields[field]["options"].items() %}
			{% set checked = "checked" if option_label in annotation.value else "" %}
-
		{% endfor %}
@@ -69,15 +68,16 @@
		{% endfor %}
	{% endif %}
-
+
{% endif %}

-	{# Store some invisible data here so we can retrieve it with JS #}
-
+	{# Store some invisible data here so we can retrieve it with JS #}
+
    {% endfor %} diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py index 7a6323a4a..9182d0c68 100644 --- a/webtool/views/views_explorer.py +++ b/webtool/views/views_explorer.py @@ -212,8 +212,14 @@ def explorer_save_annotations(dataset_key: str): except DataSetException: return error(404, error="Dataset not found.") - dataset.save_annotations(annotations, overwrite=True) - return "success" + try: + annotations_saved = dataset.save_annotations(annotations, overwrite=True) + except AnnotationException as e: + # If anything went wrong with the annotation field saving, return an error. + return jsonify(error=str(e)), 400 + + # Else return the amount of fields saved. + return str(annotations_saved) def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs) -> dict: """ From 88b760901e023506debae4f33ee64d79e61f5eec Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Fri, 23 Aug 2024 17:54:23 +0200 Subject: [PATCH 149/204] Add saving notice and fix dropdown saving --- common/lib/annotation.py | 2 +- webtool/static/css/dataset-page.css | 17 ++ webtool/static/js/explorer.js | 194 +++++++++++++----- webtool/static/js/fourcat.js | 1 - webtool/templates/explorer/explorer.html | 4 +- .../templates/explorer/post-annotations.html | 8 +- 6 files changed, 163 insertions(+), 63 deletions(-) diff --git a/common/lib/annotation.py b/common/lib/annotation.py index 147378c57..3af037a6e 100644 --- a/common/lib/annotation.py +++ b/common/lib/annotation.py @@ -306,7 +306,7 @@ def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dic fields_to_delete.add(field_id) continue - old_options = old_field["options"] + old_options = old_field.get("options", {}) options_to_update = {} # Options are saved in a dict with IDs as keys and labels as values. diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 61916e156..47585d58f 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -771,6 +771,23 @@ body.image-preview { padding: 0.5em } +.post-annotations .property-badge { + font-size: 13px; +} + +#save-annotations-notice { + position: fixed; + background-color: var(--accent-okay); + color: var(--contrast-bright); + display: none; + right: 20px; + bottom: 64px; + width: 200px; + text-align: center; + padding: 10px 5px 10px 5px; + border-radius: 10px; +} + .delete-input { float: right; } \ No newline at end of file diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index 2957d5366..69195913a 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -5,6 +5,12 @@ $(init); /* * Page init */ + +// Global variable to keep track if annotations have been edited. +var edits_made = false; +// To check if we have to save annotations when exiting a page; not necessary for refresh. 
+var exit_page = true; + function init() { // Functional stuff @@ -20,11 +26,11 @@ function init() { */ const annotations = { + init: function() { let editor = $("#annotation-fields-editor"); let editor_controls = $("#annotation-fields-editor-controls"); - var edits_made = false; // Add a new annotation field when clicking the plus icon $("#new-annotation-field").on("click", function(){ @@ -107,9 +113,10 @@ const annotations = { ".post-annotation-input, input[type=checkbox], label, option", function(){ edits_made = true; - // Navigate one level up if it's a checkbox input + let parent = $(this).parent(); - if (parent.hasClass("checkboxes")) { + // Navigate one level up if it's a checkbox or dropdown input + if (parent.hasClass("post-annotation-options")) { parent = parent.parent(); } annotations.markChanges(parent); @@ -120,10 +127,12 @@ const annotations = { annotations.saveAnnotations(); }); - // Save unsaved annotations upon changing a page. - $('.page > a').click(function(){ - annotations.saveAnnotations(); - }) + // Save unsaved annotations upon leaving the page. + window.onbeforeunload = function(){ + if (exit_page) { + annotations.saveAnnotations(); + } + }; // Check whether there's already fields saved for this dataset annotations.fieldsExist(); @@ -131,7 +140,7 @@ const annotations = { // Save annotations every 10 seconds if changes have been made setInterval(function() { if (edits_made) { - //annotations.saveAnnotations(); + annotations.saveAnnotations(); } }, 10000); @@ -327,50 +336,47 @@ const annotations = { // If there are values inserted or things changed, return an annotation object. // even if the value is an empty string. - if (el.hasClass("edited")) { - if (ann_type === "text" || ann_type === "textarea") { - val = ann_input.val(); - } else if (ann_type === "dropdown") { - val = ann_input.find(".post-annotation-options").val(); - } else if (ann_type === "checkbox") { - val = []; - el.find(".post-annotation-input").each(function () { - let checkbox = $(this); - if (checkbox.prop("checked") === true) { - val.push(checkbox.val()); - } - }); - } - // Create an annotation object and add them to the array. - let annotation = { - "field_id": field_id, - "item_id": item_id, - "label": label, - "type": ann_type, - "value": val, - "author": author, - "by_processor": false, // Explorer annotations are human-made! - "timestamp": timestamp, - "options": options, - } - return annotation; + if (ann_type === "text" || ann_type === "textarea") { + val = ann_input.val(); + } else if (ann_type === "dropdown") { + val = $(ann_input).find(":selected").val(); + } else if (ann_type === "checkbox") { + val = []; + el.find(".post-annotation-input").each(function () { + let checkbox = $(this); + if (checkbox.prop("checked") === true) { + val.push(checkbox.val()); + } + }); } - else { - // Return an empty object if nothing changed - return {}; + + // Create an annotation object and add them to the array. + let annotation = { + "field_id": field_id, + "item_id": item_id, + "label": label, + "type": ann_type, + "value": val, + "author": author, + "by_processor": false, // Explorer annotations are human-made! + "timestamp": timestamp, + "options": options, } + //console.log(annotation) + return annotation; }, applyAnnotationFields: function (e){ // Applies the annotation fields to each post on this page. 
// First we collect the annotation information from the editor - let annotation_fields = annotations.parseAnnotationFields(e); + + let new_annotation_fields = annotations.parseAnnotationFields(e); // Show an error message if the annotation fields were not valid. - if (typeof annotation_fields == "string") { - annotations.warnEditor(annotation_fields); + if (typeof new_annotation_fields == "string") { + annotations.warnEditor(new_annotation_fields); } // If everything is ok, we're going to add @@ -389,33 +395,42 @@ const annotations = { }); // We store the annotation fields in the dataset table. - annotations.saveAnnotationFields(annotation_fields); + // First check if existing annotations are affected. + if (annotation_fields) { + annotations.checkFieldChanges(new_annotation_fields, annotation_fields); + } + else { + annotations.saveAnnotationFields(new_annotation_fields); + } } }, - saveAnnotationFields: function (annotation_fields){ + saveAnnotationFields: function (new_fields){ // Save the annotation fields used for this dataset // to the datasets table. + // `old fields` can be given to warn the user if changes to existing fields + // will affect annotations, like deleting a field or changing its type. + + let dataset_key = $("#dataset-key").text(); - if (annotation_fields.length < 1) { + if (new_fields.length < 1) { return; } - // If there's annotation fields, we can enable/disable the buttons - annotations.fieldsExist(); - - let dataset_key = $("#dataset-key").text(); // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: JSON.stringify(annotation_fields), - success: function (response) { + data: JSON.stringify(new_fields), + success: function () { // If the query is accepted by the server... + exit_page = false; location.reload(); // ...simply reload the page to render the template again }, error: function (error) { + console.log(error); + if (error.status == 400) { annotations.warnEditor(error.responseJSON.error); } @@ -427,6 +442,62 @@ const annotations = { }); }, + checkFieldChanges(new_fields, old_fields) { + + let deleted_fields = []; + let changed_type_fields = []; + + // Warn the user in case fields are deleted or changed from text to choice. + if (old_fields) { + let text_fields = ["text", "textarea"]; + let choice_fields = ["checkbox", "dropdown"]; + + for (let old_field_id in old_fields) { + + // Deleted + if (!(old_field_id in new_fields) || !new_fields) { + deleted_fields.push(old_fields[old_field_id]["label"]); + } else { + let old_type = old_fields[old_field_id]["type"]; + let new_type = new_fields[old_field_id]["type"] + if (old_type !== new_type) { + // Changed from text to choice, or the reverse. + // In this case annotations will be deleted. + // Changes from dropdown to checkbox also result in deleted annotations. + if ((text_fields.includes(old_type) && choice_fields.includes(new_type)) || + (choice_fields.includes(old_type) && text_fields.includes(new_type)) || + (choice_fields.includes(old_type) && choice_fields.includes(new_type))) { + changed_type_fields.push(new_type); + } + } + } + } + } + + // Ask 4 confirmation + if (deleted_fields.length > 0 || changed_type_fields.length > 0) { + let msg = ""; + if (deleted_fields.length > 0 && changed_type_fields.length > 0) { + msg = `Deleting fields and changing field types will also delete existing annotations that belonged to them. 
+ Do you want to continue?`; + } + else if (changed_type_fields.length > 0) { + msg = `Changing field types will also delete existing annotations that belonged to them. + Do you want to continue?`; + } + else if (deleted_fields.length > 0) { + msg = `Deleting fields will also delete existing annotations that belonged to them. + Do you want to continue?`; + } + popup.confirm(msg, "Confirm", () => { + annotations.saveAnnotationFields(new_fields); + }); + } + else { + annotations.saveAnnotationFields(new_fields); + } + }, + saveAnnotations: function (){ // Write the annotations to the dataset and annotations table. @@ -443,10 +514,12 @@ const annotations = { post_annotations.find(".post-annotation").each(function(){ - // Extract annotation object from the element - let annotation = annotations.parseAnnotation($(this)); - if (Object.keys(annotation).length > 0 ) { - anns.push(annotation); + // Extract annotation object from edited elements + if ($(this).hasClass("edited")) { + let annotation = annotations.parseAnnotation($(this)); + if (Object.keys(annotation).length > 0 ) { + anns.push(annotation); + } } }); } @@ -463,16 +536,18 @@ const annotations = { success: function (response) { save_annotations.html(" Save annotations"); + annotations.notifySaved(); + edits_made = false; }, error: function (error) { + console.log(error) if (error.status == 400) { annotations.warnEditor(error.responseJSON.error); } else { - annotations.warnEditor("Server error, couldn't save annotation fields.") + annotations.warnEditor("Server error, couldn't save annotations.") } save_annotations.html(" Save annotations"); - console.log(error) } }); }, @@ -504,6 +579,13 @@ const annotations = { } }, + notifySaved: function() { + // Flash a fixed div with the notice that annotations are saved. + let notice = $("#save-annotations-notice"); + notice.fadeIn(400); + notice.delay(1750).fadeOut(1000); + }, + showAnnotations: function() { let ta = $("#toggle-annotations"); ta.addClass("shown"); diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js index e36793c8b..7182ff1d5 100644 --- a/webtool/static/js/fourcat.js +++ b/webtool/static/js/fourcat.js @@ -1637,7 +1637,6 @@ const ui_helpers = { } }, - /** * Ask for confirmation before doing whatever happens when the event goes through * diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 7301190f4..a19aa23c4 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -12,7 +12,7 @@