diff --git a/backend/database.sql b/backend/database.sql
index 1f372a697..b4f621cbb 100644
--- a/backend/database.sql
+++ b/backend/database.sql
@@ -68,11 +68,41 @@ CREATE TABLE datasets_owners (
 CREATE UNIQUE INDEX datasets_owners_user_key_idx ON datasets_owners("name" text_ops,key text_ops);
-
 -- annotations
 CREATE TABLE IF NOT EXISTS annotations (
-  key                   text UNIQUE PRIMARY KEY,
-  annotations           text DEFAULT ''
+  id                    SERIAL PRIMARY KEY,
+  dataset               TEXT,
+  field_id              TEXT,
+  item_id               TEXT,
+  timestamp             INT DEFAULT 0,
+  timestamp_created     INT DEFAULT 0,
+  label                 TEXT,
+  type                  TEXT,
+  options               TEXT,
+  value                 TEXT,
+  author                TEXT,
+  author_original       TEXT,
+  by_processor          BOOLEAN DEFAULT FALSE,
+  metadata              TEXT
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS annotation_id
+  ON annotations (
+    id
+);
+CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique
+  ON annotations (
+    label,
+    dataset,
+    item_id
+);
+CREATE INDEX IF NOT EXISTS annotation_value
+  ON annotations (
+    value
+);
+CREATE INDEX IF NOT EXISTS annotation_timestamp
+  ON annotations (
+    timestamp
 );
 
 -- metrics
diff --git a/backend/lib/processor.py b/backend/lib/processor.py
index 29efde8c4..52ad88cec 100644
--- a/backend/lib/processor.py
+++ b/backend/lib/processor.py
@@ -10,6 +10,7 @@
 import abc
 import csv
 import os
+import random
 
 from pathlib import Path, PurePath
@@ -18,7 +19,7 @@
 from common.lib.fourcat_module import FourcatModule
 from common.lib.helpers import get_software_commit, remove_nuls, send_email
 from common.lib.exceptions import (WorkerInterruptedException, ProcessorInterruptedException, ProcessorException,
-                                   DataSetException, MapItemException)
+                                   DataSetException, MapItemException, AnnotationException)
 from common.config_manager import config, ConfigWrapper
 from common.lib.user import User
@@ -402,7 +403,7 @@ def add_field_to_parent(self, field_name, new_data, which_parent=source_dataset,
         TODO: could be improved by accepting different types of data depending on csv or ndjson.
 
-        :param str field_name:  name of the desired
+        :param str field_name:  Name of the desired new field
         :param List new_data:  List of data to be added to parent dataset
         :param DataSet which_parent:  DataSet to be updated (e.g., self.source_dataset, self.dataset.get_parent(), self.dataset.top_parent())
         :param bool update_existing:  False (default) will raise an error if the field_name already exists
@@ -721,6 +722,63 @@ def create_standalone(self):
 
         return standalone
 
+    def write_annotations(self, annotations: list, source_dataset=None, overwrite=False) -> int:
+        """
+        Saves annotations made by this processor on the basis of another dataset.
+        Also adds some data regarding this processor: unless explicitly given,
+        `author` and `label` are set to the processor name, and the annotations
+        are marked as made by a processor.
+
+        :param annotations:     List of dictionaries with annotation items. Must have `item_id` and `value`.
+                                E.g. [{"item_id": "12345", "label": "Valid", "value": "Yes"}]
+        :param source_dataset:  The dataset that these annotations were based on.
+                                Defaults to the parent dataset.
+        :param bool overwrite:  Whether to overwrite annotations if the label is already present
+                                for the dataset. If this is False and the label is already present,
+                                we'll add a number to the label to differentiate it (e.g. `count-posts-1`).
+                                Else we'll just replace the old data.
+
+        :returns int: How many annotations were saved.
+
+        """
+
+        if not annotations:
+            return 0
+
+        # Default to the parent dataset
+        if not source_dataset:
+            source_dataset = self.source_dataset
+
+        # Check if this dataset already has annotation fields
+        existing_labels = source_dataset.get_annotation_field_labels()
+
+        # Set some values
+        for annotation in annotations:
+
+            # Set the default label to this processor's name
+            if not annotation.get("label"):
+                # If the processor has already generated annotation fields,
+                # add a number to differentiate the label
+                label = self.name
+                if not overwrite and label in existing_labels:
+                    label += "-" + str(len([l for l in existing_labels if l.startswith(label)]))
+                annotation["label"] = label
+            elif annotation.get("label") and not overwrite:
+                if annotation["label"] in existing_labels:
+                    raise AnnotationException("Annotation label '%s' already exists for this dataset" % annotation["label"])
+
+            # Set the author to this processor's name
+            if not annotation.get("author"):
+                annotation["author"] = self.name
+            if not annotation.get("author_original"):
+                annotation["author_original"] = self.name
+
+            annotation["by_processor"] = True
+
+        annotations_saved = source_dataset.save_annotations(annotations, overwrite=overwrite)
+        return annotations_saved
+
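For review purposes, a minimal sketch of how a processor's `process()` method might call this new helper; the item field and status message are made up:

```python
# Hypothetical processor snippet: annotate each item in the source dataset
# with the length of its body text. `label` defaults to the processor name.
annotations = [
    {"item_id": item["id"], "value": len(item.get("body", ""))}
    for item in self.source_dataset.iterate_items(self)
]
saved = self.write_annotations(annotations, overwrite=False)
self.dataset.update_status("Saved %i annotations" % saved)
```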
     @classmethod
     def map_item_method_available(cls, dataset):
         """
@@ -856,6 +914,8 @@ def get_extension(self, parent_dataset=None):
             # A non filter processor updated the base Processor extension to None/False?
             return None
+
+
     @classmethod
     def is_rankable(cls, multiple_items=True):
         """
diff --git a/common/lib/annotation.py b/common/lib/annotation.py
new file mode 100644
index 000000000..9b05bb197
--- /dev/null
+++ b/common/lib/annotation.py
@@ -0,0 +1,445 @@
+"""
+Annotation class
+"""
+
+
+import time
+import json
+
+from common.lib.helpers import hash_to_md5
+from common.lib.exceptions import AnnotationException
+
+
+class Annotation:
+    """
+    Annotation class
+
+    Annotations are always tied to a dataset, a dataset item (e.g. a csv row),
+    an annotation label, and a type ('text', 'multichoice', etc.).
+    """
+
+    # Attributes must be created here to ensure getattr and setattr work properly
+
+    data = None
+    db = None
+
+    id = None                   # Unique ID for this annotation
+    item_id = None              # ID of the item that this annotation was made for, e.g. a post ID
+    field_id = None             # ID of the annotation field
+    dataset = None              # Key of the dataset this annotation was generated from
+    timestamp = None            # When this annotation was last edited
+    timestamp_created = None    # When this annotation was created
+    label = None                # Label of the annotation
+    type = None                 # Type of annotation (e.g. `text`)
+    options = None              # Possible options
+    value = None                # The actual annotation value
+    author = None               # Who last edited the annotation
+    author_original = None      # Who originally made the annotation
+    by_processor = None         # Whether the annotation was made by a processor
+    metadata = None             # Misc metadata
+
+    def __init__(self, data=None, annotation_id=None, db=None):
+        """
+        Instantiate annotation object.
+
+        :param data:           Annotation data; should correspond to the annotations table record.
+        :param annotation_id:  The ID of an annotation. If given, the annotation is retrieved
+                               from the database.
+        :param db:             Database connection object
+        """
+
+        required_fields = ["label", "item_id", "dataset"]
+
+        # Must have an ID or data
+        if (annotation_id is None and data is None) or (data is not None and not isinstance(data, dict)):
+            raise AnnotationException("Annotation() requires either a valid `data` dictionary or an ID.")
+
+        if not db:
+            raise AnnotationException("Annotation() needs a `db` database object")
+
+        self.db = db
+
+        new_or_updated = False
+
+        # Get the annotation data if the ID is given; if an annotation has
+        # an ID, it is guaranteed to be in the database.
+        # IDs can either be passed explicitly or be present in the data dict.
+        if annotation_id is not None or "id" in data:
+            if data and "id" in data:
+                annotation_id = data["id"]
+            self.id = annotation_id  # IDs correspond to unique serial numbers in the database
+            current = self.get_by_id(annotation_id)
+            if not current:
+                raise AnnotationException(
+                    "Annotation() requires a valid ID for an existing annotation, %s given" % annotation_id)
+
+        # If an ID is not given, get or create an Annotation object from its data.
+        # First check if the required fields are present in `data`.
+        else:
+            for required_field in required_fields:
+                if required_field not in data or not data[required_field]:
+                    raise AnnotationException("Annotation() requires a %s field" % required_field)
+
+            # Check if this annotation already exists, based on dataset key, item ID, and label.
+            current = self.get_by_field(data["dataset"], data["item_id"], data["label"])
+
+        # If we were able to retrieve an annotation from the db, it already exists
+        if current:
+            # Check if we have to overwrite old data with new data
+            if data:
+                for key, value in data.items():
+                    # Save unknown fields in metadata
+                    if key not in current:
+                        current["metadata"][key] = value
+                        new_or_updated = True
+                    # If values differ, update the value
+                    elif current[key] != value:
+                        current[key] = value
+                        new_or_updated = True
+
+            self.data = current
+
+        # If this is a new annotation, set all the properties.
+        else:
+            # Keep track of when the annotation was made
+            created_timestamp = int(time.time())
+
+            new_data = {
+                "dataset": data["dataset"],
+                "item_id": data["item_id"],
+                "field_id": data["field_id"]
+                if data.get("field_id") else self.set_field_id(data["dataset"], data["label"]),
+                "timestamp": created_timestamp,
+                "timestamp_created": created_timestamp,
+                "label": data["label"],
+                "type": data.get("type", "text"),
+                "options": data.get("options", ""),
+                "value": data.get("value", ""),
+                "author": data.get("author", ""),
+                "author_original": data.get("author", ""),
+                "by_processor": data.get("by_processor", False),
+                "metadata": data.get("metadata", {}),
+            }
+
+            self.data = new_data
+            new_or_updated = True
+
+        if isinstance(self.data["metadata"], str):
+            try:
+                self.metadata = json.loads(self.data["metadata"])
+            except (TypeError, json.JSONDecodeError):
+                self.metadata = {}
+
+        for k, v in self.data.items():
+            # Some type checking
+            try:
+                if k == "timestamp" or k == "timestamp_created":
+                    v = int(v)
+                elif k == "by_processor":
+                    v = bool(v)
+            except ValueError as e:
+                raise AnnotationException("Annotation fields are not of the right type (%s)" % e)
+            self.__setattr__(k, v)
+
+        # Write to db if anything changed
+        if new_or_updated:
+            self.timestamp = int(time.time())
+            self.write_to_db()
+
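A sketch of the two construction paths this constructor supports; the `db` object, dataset key, and IDs are placeholders:

```python
# Path 1: retrieve an existing annotation by its serial ID
annotation = Annotation(annotation_id=42, db=db)

# Path 2: get-or-create by (dataset, item_id, label); this writes to the
# database right away if the annotation is new or its values changed
annotation = Annotation(data={
    "dataset": "abcdef1234567890",  # placeholder dataset key
    "item_id": "12345",
    "label": "Valid",
    "value": "Yes"
}, db=db)
```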
+    def get_by_id(self, annotation_id: int):
+        """
+        Get annotation by ID
+
+        :param int annotation_id:  ID of the annotation.
+        :return dict: The annotation data, or an empty dict if the ID doesn't exist.
+        """
+
+        try:
+            int(annotation_id)
+        except (TypeError, ValueError):
+            raise AnnotationException("ID '%s' is not a valid annotation ID" % annotation_id)
+
+        data = self.db.fetchone("SELECT * FROM annotations WHERE id = %s", (annotation_id,))
+
+        if not data:
+            return {}
+
+        if data["type"] == "checkbox":
+            data["value"] = data["value"].split(",")
+        data["metadata"] = json.loads(data["metadata"])
+
+        return data
+
+    def get_by_field(self, dataset_key: str, item_id: str, label: str) -> dict:
+        """
+        Get the annotation information via its dataset key, item ID, and label.
+        This is always a unique combination.
+
+        :param dataset_key:  The key of the dataset this annotation was made for.
+        :param item_id:      The ID of the item this annotation was made for.
+        :param label:        The label of the annotation.
+
+        :return dict: The data of the retrieved annotation, or an empty dict if it doesn't exist.
+        """
+
+        data = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s AND item_id = %s AND label = %s",
+                                (dataset_key, str(item_id), label))
+        if not data:
+            return {}
+
+        if data["type"] == "checkbox":
+            data["value"] = data["value"].split(",")
+        data["metadata"] = json.loads(data["metadata"])
+
+        return data
+
+    def set_field_id(self, dataset_key: str, label: str) -> str:
+        """
+        Sets a `field_id` based on the dataset key and label.
+        This combination should be unique.
+
+        :param dataset_key:  The dataset key.
+        :param label:        The label of the annotation field.
+
+        :return str: The generated field ID.
+        """
+
+        base_field_id = dataset_key + label
+        self.field_id = hash_to_md5(base_field_id)
+        return self.field_id
+
+    def write_to_db(self):
+        """
+        Write this annotation to the database.
+        """
+        db_data = self.data
+
+        db_data["timestamp"] = int(time.time())
+        db_data["metadata"] = json.dumps(db_data["metadata"])
+        if db_data["type"] == "checkbox":
+            db_data["value"] = ",".join(db_data["value"])
+
+        return self.db.upsert("annotations", data=db_data, constraints=["label", "dataset", "item_id"])
+
+    def delete(self):
+        """
+        Deletes this annotation.
+        """
+        return self.db.delete("annotations", {"id": self.id})
+
+    @staticmethod
+    def delete_many(db, dataset_key=None, annotation_id=None, field_id=None):
+        """
+        Deletes annotations for an entire dataset or by a list of (field) IDs.
+
+        :param db:                Database object.
+        :param str dataset_key:   A dataset key.
+        :param li annotation_id:  A list of unique annotation IDs, or a single ID.
+        :param li field_id:       A list of IDs for annotation fields, or a single ID.
+
+        :return int: The number of removed records.
+        """
+        if not dataset_key and not annotation_id and not field_id:
+            return 0
+
+        where = {}
+        if dataset_key:
+            where["dataset"] = dataset_key
+        if annotation_id:
+            where["id"] = annotation_id
+        if field_id:
+            where["field_id"] = field_id
+
+        return db.delete("annotations", where)
+
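A quick sketch of the two deletion paths; the IDs are placeholders:

```python
# Delete a single annotation via its object
Annotation(annotation_id=42, db=db).delete()

# Bulk-delete all annotations tied to two (hypothetical) field IDs
removed = Annotation.delete_many(db, field_id=["0a1b2c3d", "4e5f6a7b"])
```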
+    @staticmethod
+    def update_annotations_via_fields(dataset_key, old_fields: dict, new_fields: dict, db) -> int:
+        """
+        Updates annotations in the annotations table if the input fields
+        themselves have been changed, for instance if a dropdown label is renamed
+        or a field is deleted.
+
+        :param str dataset_key:  The dataset key for which fields changed.
+        :param dict old_fields:  Old annotation fields.
+        :param dict new_fields:  New annotation fields; this should contain not just
+                                 the additions, but all fields, changed or otherwise.
+        :param db:               Database object so we can write.
+
+        :returns int: How many records were affected.
+        """
+
+        text_fields = ["textarea", "text"]
+
+        # If old and new fields are identical, do nothing.
+        if old_fields == new_fields:
+            return 0
+
+        fields_to_delete = set()  # Delete all annotations with this field ID
+        fields_to_update = {}     # Update values of annotations with this field ID
+        old_options = {}
+
+        # Loop through the old annotation fields
+        for old_field_id, old_field in old_fields.items():
+
+            # Delete all annotations of this type if the field is deleted.
+            if old_field_id not in new_fields:
+                fields_to_delete.add(old_field_id)
+                continue
+
+            field_id = old_field_id
+            new_field = new_fields[field_id]
+
+            # If the annotation type has changed, also delete existing annotations,
+            # except between text and textarea, where we can just change the type
+            # and keep the text.
+            if old_field["type"] != new_field["type"]:
+                if old_field["type"] not in text_fields or new_field["type"] not in text_fields:
+                    fields_to_delete.add(field_id)
+                    continue
+
+            # Loop through all the key/values in the new field settings
+            # and update in case they differ from the old values.
+            update_data = {}
+            for field_key, field_value in new_field.items():
+
+                # Update if values don't match
+                if field_value != old_field.get(field_key):
+
+                    # Special case: option values that are removed/renamed.
+                    # Here we may have to change/delete values within the
+                    # `value` column.
+                    if field_key == "options":
+
+                        new_options = field_value
+
+                        # Edge case: delete annotations of this type if all options are deleted
+                        if not new_options:
+                            fields_to_delete.add(field_id)
+                            continue
+
+                        # Changed option values (e.g. renamed, or an option deleted)
+                        old_options[old_field_id] = old_field.get("options", {})
+                        options_to_update = {}
+                        if old_options[old_field_id] and old_options[old_field_id] != new_options:
+                            options_to_update = new_options
+
+                        if options_to_update:
+                            update_data[field_key] = {"options": options_to_update}
+
+                    # For all other changes, just overwrite with new data.
+                    else:
+                        update_data[field_key] = field_value
+
+            if update_data:
+                fields_to_update[field_id] = update_data
+
+        # Delete annotations
+        if fields_to_delete:
+            Annotation.delete_many(db, field_id=list(fields_to_delete))
+
+        # Write changes to fields to the database
+        count = 0
+        if fields_to_update:
+            for field_id, updates in fields_to_update.items():
+
+                # Write to db
+                for column, update_value in updates.items():
+
+                    update_value_insert = update_value
+                    if column == "options":
+                        update_value_insert = ",".join(list(update_value["options"].values()))
+
+                    # Change values of columns
+                    updated_rows = db.update("annotations", {column: update_value_insert},
+                                             where={"dataset": dataset_key, "field_id": field_id})
+                    count += updated_rows
+
+                    # Special case: changed option labels.
+                    # Here we also have to rename/remove options in the `value` column.
+                    if column == "options":
+
+                        annotations = db.fetchall("SELECT id, options, value FROM annotations "
+                                                  "WHERE dataset = %s AND field_id = %s AND value != '';",
+                                                  (dataset_key, field_id))
+
+                        for annotation in annotations:
+                            annotation_id = annotation["id"]
+                            annotation_values = annotation["value"].split(",")
+
+                            # Remove or rename options
+                            new_values = []
+                            new_options = update_value["options"]  # Dict with option id -> label as items
+
+                            for ann_value in annotation_values:
+                                # Get the option ID, so we can see if it's new, deleted, or renamed;
+                                # it should always be present in the old options dict.
+                                option_ids = [k for k, v in old_options[field_id].items() if v == ann_value]
+                                if not option_ids:
+                                    continue
+                                option_id = option_ids[0]
+                                # Deleted...
+                                if option_id not in new_options:
+                                    continue
+                                # Or replaced with a new, possibly renamed value
+                                new_values.append(new_options[option_id])
+
+                            new_values = ",".join(new_values)
+                            db.update("annotations", {"value": new_values}, where={"id": annotation_id})
+
+        return count
+
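To illustrate the intended behaviour, a sketch of a label rename propagating to stored annotations; the field ID and dataset key are placeholders:

```python
old_fields = {"0a1b2c3d": {"label": "Valid", "type": "text"}}
new_fields = {"0a1b2c3d": {"label": "Validity", "type": "text"}}

# Rewrites the `label` column of all annotation records with this field ID
affected = Annotation.update_annotations_via_fields("abcdef1234567890", old_fields, new_fields, db)
```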
+    def __getattr__(self, attr):
+        """
+        Getter so we don't have to use .data all the time
+
+        :param attr:  Data key to get
+        :return:  Value
+        """
+
+        if attr in dir(self):
+            # an explicitly defined attribute should always be called in favour
+            # of this passthrough
+            attribute = getattr(self, attr)
+            return attribute
+        elif attr in self.data:
+            return self.data[attr]
+        else:
+            raise AttributeError("Annotation instance has no attribute %s" % attr)
+
+    def __setattr__(self, attr, value):
+        """
+        Setter so we can flexibly update the database
+
+        Also updates internal data stores (.data etc.). If the attribute is
+        unknown, it is stored within the `metadata` attribute.
+
+        :param str attr:  Attribute to update
+        :param value:     New value
+        """
+
+        # don't override behaviour for *actual* class attributes
+        if attr in dir(self):
+            super().__setattr__(attr, value)
+            return
+
+        if attr not in self.data:
+            self.metadata[attr] = value
+            attr = "metadata"
+            value = self.metadata
+
+        if attr == "metadata":
+            value = json.dumps(value)
+
+        self.db.update("annotations", where={"id": self.id}, data={attr: value})
+
+        self.data[attr] = value
+        if attr == "metadata":
+            self.metadata = json.loads(value)
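These passthroughs make annotation records read and write like plain attributes; a small sketch (the annotation ID is a placeholder):

```python
annotation = Annotation(annotation_id=42, db=db)
print(annotation.label, annotation.value)  # instead of annotation.data["label"]

# Attributes that are not table columns are folded into the `metadata`
# column and written to the database immediately
annotation.coder_note = "double-checked"
```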
diff --git a/common/lib/config_definition.py b/common/lib/config_definition.py
index ee38ce705..4138ef4d0 100644
--- a/common/lib/config_definition.py
+++ b/common/lib/config_definition.py
@@ -4,18 +4,27 @@
 Possible options and their default values. Options are actually set in 4CAT"s
 Database. Additional options can be defined in Data sources or Processors as
 `config` objects.
+
+The order of the dictionary below determines the order of the settings in the interface.
+
 """
 from common.lib.user_input import UserInput
 import json
 
 config_definition = {
-    "datasources._intro": {
+    "datasources.intro": {
         "type": UserInput.OPTION_INFO,
         "help": "Data sources enabled below will be offered to people on the 'Create Dataset' page. Additionally, "
                 "people can upload datasets for these by for example exporting them with "
                 "[Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer) to this 4CAT instance.\n\n"
                 "Some data sources offer further settings which may be configured on other tabs."
     },
+    "datasources.intro2": {
+        "type": UserInput.OPTION_INFO,
+        "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings "
+                "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the "
+                "expiration status of existing datasets."
+    },
     "datasources.enabled": {
         "type": UserInput.OPTION_DATASOURCES,
         "default": ["ninegag", "douban", "douyin", "imgur", "upload", "instagram", "linkedin", "parler",
@@ -23,12 +32,6 @@
         "help": "Data Sources",
         "tooltip": "A list of enabled data sources that people can choose from when creating a dataset page."
     },
-    "datasources._intro2": {
-        "type": UserInput.OPTION_INFO,
-        "help": "*Warning:* changes take effect immediately. Datasets that would have expired under the new settings "
-                "will be deleted. You can use the 'Dataset bulk management' module in the control panel to manage the "
-                "expiration status of existing datasets."
-    },
     "datasources.expiration": {
         "type": UserInput.OPTION_TEXT_JSON,
         "default": {"fourchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightchan": {"enabled": False, "allow_optout": False, "timeout": 0}, "eightkun": {"enabled": False, "allow_optout": False, "timeout": 0}, "ninegag": {"enabled": True, "allow_optout": False, "timeout": 0}, "bitchute": {"enabled": True, "allow_optout": False, "timeout": 0}, "dmi-tcat": {"enabled": False, "allow_optout": False, "timeout": 0}, "dmi-tcatv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "douban": {"enabled": True, "allow_optout": False, "timeout": 0}, "douyin": {"enabled": True, "allow_optout": False, "timeout": 0}, "gab": {"enabled": True, "allow_optout": False, "timeout": 0}, "imgur": {"enabled": True, "allow_optout": False, "timeout": 0}, "upload": {"enabled": True, "allow_optout": False, "timeout": 0}, "instagram": {"enabled": True, "allow_optout": False, "timeout": 0}, "linkedin": {"enabled": True, "allow_optout": False, "timeout": 0}, "parler": {"enabled": True, "allow_optout": False, "timeout": 0}, "reddit": {"enabled": False, "allow_optout": False, "timeout": 0}, "telegram": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok": {"enabled": True, "allow_optout": False, "timeout": 0}, "tiktok-urls": {"enabled": False, "allow_optout": False, "timeout": 0}, "truthsocial": {"enabled": True, "allow_optout": False, "timeout": 0}, "tumblr": {"enabled": False, "allow_optout": False, "timeout": 0}, "twitter": {"enabled": True, "allow_optout": False, "timeout": 0}, "twitterv2": {"enabled": False, "allow_optout": False, "timeout": 0}, "usenet": {"enabled": False, "allow_optout": False, "timeout": 0}, "vk": {"enabled": False, "allow_optout": False, "timeout": 0}},
@@ -109,8 +112,8 @@
     "privileges.can_use_explorer": {
         "type": UserInput.OPTION_TOGGLE,
         "default": True,
-        "help": "Can use explorer",
-        "tooltip": "Controls whether users can use the Explorer feature to navigate datasets."
+        "help": "Can use Explorer",
+        "tooltip": "Controls whether users can use the Explorer feature to analyse and annotate datasets."
     },
     "privileges.can_export_datasets": {
         "type": UserInput.OPTION_TOGGLE,
@@ -302,13 +305,18 @@
         "global": True
     },
     # Explorer settings
-    # The maximum allowed amount of rows (prevents timeouts and memory errors)
+    "explorer.basic-explanation": {
+        "type": UserInput.OPTION_INFO,
+        "help": "4CAT's Explorer feature lets you navigate and annotate datasets as if they "
+                "appeared on their original platform. This is intended to facilitate qualitative "
+                "exploration and manual coding."
+    },
     "explorer.max_posts": {
         "type": UserInput.OPTION_TEXT,
         "default": 100000,
        "help": "Amount of posts",
        "coerce_type": int,
-        "tooltip": "Amount of posts to show in Explorer. 
The maximum allowed amount of rows (prevents timeouts and " + "tooltip": "Maximum number of posts to be considered by the Explorer (prevents timeouts and " "memory errors)" }, "explorer.posts_per_page": { @@ -316,7 +324,29 @@ "default": 50, "help": "Posts per page", "coerce_type": int, - "tooltip": "Posts to display per page" + "tooltip": "Number of posts to display per page" + }, + "explorer.config_explanation": { + "type": UserInput.OPTION_INFO, + "help": "Per data source, you can enable or disable the Explorer. Posts will be formatted through a generic template " + "made of [this HTML file](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/templates/explorer/" + "templates/generic.html) and [this CSS file](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/" + "static/css/explorer/generic.css). For various data sources, data source-specific templates are also available. " + "These are made of a custom HTML template in [this directory](https://github.com/digitalmethodsinitiative/4cat/tree/master/" + "webtool/datasource-templates/explorer/templates) and a custom CSS file [in this directory](https://github.com/digitalmethodsinitiative/4cat/tree/master/webtool/static/css/explorer)." + }, + "explorer.config": { + "type": UserInput.OPTION_DATASOURCES_TABLE, + "help": "Explorer settings per data source", + "default": {"fourchan": {"enabled": True}, "eightchan": {"enabled": True}, "eightkun": {"enabled": True}, "ninegag": {"enabled": True}, "bitchute": {"enabled": True}, "dmi-tcat": {"enabled": True}, "dmi-tcatv2": {"enabled": True}, "douban": {"enabled": True}, "douyin": {"enabled": False}, "imgur": {"enabled": True}, "upload": {"enabled": True}, "instagram": {"enabled": True}, "linkedin": {"enabled": True}, "parler": {"enabled": True}, "reddit": {"enabled": True}, "telegram": {"enabled": True}, "tiktok": {"enabled": True}, "tiktok-urls": {"enabled": True}, "tumblr": {"enabled": True}, "twitter": {"enabled": True}, "twitterv2": {"enabled": True}, "usenet": {"enabled": True}, "vk": {"enabled": True}}, + "columns": { + "enabled": { + "type": UserInput.OPTION_TOGGLE, + "help": "Enable Explorer", + "tooltip": "Whether the Explorer is available for this data source", + "default": True + } + } }, # Web tool settings # These are used by the FlaskConfig class in config.py @@ -544,7 +574,7 @@ "4cat": "4CAT Tool settings", "api": "API credentials", "flask": "Flask settings", - "explorer": "Data Explorer", + "explorer": "Explorer", "datasources": "Data sources", "expire": "Dataset expiration settings", "mail": "Mail settings & credentials", diff --git a/common/lib/database.py b/common/lib/database.py index 9166dab4f..eb69a0d2f 100644 --- a/common/lib/database.py +++ b/common/lib/database.py @@ -105,8 +105,8 @@ def update(self, table, data, where=None, commit=True): Update a database record :param string table: Table to update - :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param dict data: Data to set, Column => Value + :param dict where: Simple conditions, parsed as "column1 = value1 AND column2 = value2" etc :param bool commit: Whether to commit after executing the query :return int: Number of affected rows. 
Note that this may be unreliable if `commit` is `False`
diff --git a/common/lib/dataset.py b/common/lib/dataset.py
index b494acbd3..66060d735 100644
--- a/common/lib/dataset.py
+++ b/common/lib/dataset.py
@@ -1,7 +1,6 @@
 import collections
 import itertools
 import datetime
-import hashlib
 import fnmatch
 import random
 import shutil
@@ -13,13 +12,16 @@
 from pathlib import Path
 
 from common.config_manager import config
+from common.lib.annotation import Annotation
 from common.lib.job import Job, JobNotFoundException
 from common.lib.module_loader import ModuleCollector
-from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version
+from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version, hash_to_md5
 from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem
 from common.lib.fourcat_module import FourcatModule
 from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException,
-                                   MapItemException, MappedItemIncompleteException)
+                                   MapItemException, MappedItemIncompleteException, AnnotationException)
 
 
 class DataSet(FourcatModule):
@@ -238,7 +240,7 @@ def clear_log(self):
         extension.
         """
         log_path = self.get_log_path()
-        with log_path.open("w") as outfile:
+        with log_path.open("w"):
             pass
 
     def log(self, log):
@@ -358,9 +360,12 @@ def iterate_items(self, processor=None, warn_unmappable=True, map_missing="defau
         if own_processor and own_processor.map_item_method_available(dataset=self):
             item_mapper = True
 
+        # Annotations are dynamically added, and we handle them as 'extra' map_item fields.
+        annotation_labels = self.get_annotation_field_labels()
+
         # missing field strategy can be for all fields at once, or per field
         # if it is per field, it is a dictionary with field names and their strategy
-        # if it is for all fields, it is may be a callback, 'abort', or 'default'
+        # if it is for all fields, it may be a callback, 'abort', or 'default'
         default_strategy = "default"
         if type(map_missing) is not dict:
             default_strategy = map_missing
@@ -398,10 +403,32 @@
                             mapped_item.data[missing_field] = mapped_item.data[missing_field].value
                         else:
                             raise ValueError("map_missing must be 'abort', 'default', or a callback.")
-
                 else:
                     mapped_item = original_item
 
+                # Add possible annotations
+                if annotation_labels:
+
+                    # We're always handling annotated data as a MappedItem object,
+                    # even if no map_item() function is available for the data source.
+                    if not isinstance(mapped_item, MappedItem):
+                        mapped_item = MappedItem(mapped_item)
+
+                    # Get annotations for this specific post
+                    post_annotations = self.get_annotations(item_id=mapped_item.data["id"])
+
+                    for annotation_label in annotation_labels:
+                        value = ""
+                        for post_annotation in post_annotations:
+                            if post_annotation.label == annotation_label:
+                                value = post_annotation.value
+                        if isinstance(value, list):
+                            value = ",".join(value)
+
+                        # Always add the annotation column, with an empty
+                        # string if the item has no annotation value.
+                        mapped_item.data[annotation_label] = value
+
                 # yield a DatasetItem, which is a dict with some special properties
                 yield DatasetItem(mapper=item_mapper, original=original_item, mapped_object=mapped_item,
                                   **(mapped_item.get_item_data() if type(mapped_item) is MappedItem else mapped_item))
@@ -543,6 +570,7 @@ def delete(self, commit=True):
             pass
 
         # delete from database
+        self.delete_annotations()
         self.db.delete("datasets", where={"key": self.key}, commit=commit)
         self.db.delete("datasets_owners", where={"key": self.key}, commit=commit)
         self.db.delete("users_favourites", where={"key": self.key}, commit=commit)
@@ -652,7 +680,7 @@ def get_owners_users(self, role="owner"):
 
         # owners that are owner by being part of a tag
         owners.extend(itertools.chain(*[tagged_owners for tag, tagged_owners in self.tagged_owners.items() if
-                       role is None or self.owners[f"tag:{tag}"]["role"] == role]))
+                                        role is None or self.owners[f"tag:{tag}"]["role"] == role]))
 
         # de-duplicate before returning
         return set(owners)
@@ -822,40 +850,12 @@ def get_columns(self):
             # Filetype not CSV or an NDJSON with `map_item`
             return []
 
-    def get_annotation_fields(self):
-        """
-        Retrieves the saved annotation fields for this dataset.
-        :return dict: The saved annotation fields.
-        """
-
-        annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.top_parent().key,))
-
-        if annotation_fields and annotation_fields.get("annotation_fields"):
-            annotation_fields = json.loads(annotation_fields["annotation_fields"])
-        else:
-            annotation_fields = {}
-
-        return annotation_fields
-
-    def get_annotations(self):
-        """
-        Retrieves the annotations for this dataset.
-        return dict: The annotations
-        """
-
-        annotations = self.db.fetchone("SELECT annotations FROM annotations WHERE key = %s;", (self.top_parent().key,))
-
-        if annotations and annotations.get("annotations"):
-            return json.loads(annotations["annotations"])
-        else:
-            return None
-
     def update_label(self, label):
         """
         Update label for this dataset
 
-        :param str label:  New label
-        :return str:  The new label, as returned by get_label
+        :param str label: New label
+        :return str: The new label, as returned by get_label
         """
         self.parameters["label"] = label
@@ -993,7 +993,7 @@ def get_key(self, query, parameters, parent="", time_offset=0):
         parent_key = str(parent) if parent else ""
         plain_key = repr(param_key) + str(query) + parent_key
-        hashed_key = hashlib.md5(plain_key.encode("utf-8")).hexdigest()
+        hashed_key = hash_to_md5(plain_key)
 
         if self.db.fetchone("SELECT key FROM datasets WHERE key = %s", (hashed_key,)):
             # key exists, generate a new one
@@ -1334,7 +1334,7 @@ def get_place_in_queue(self, update=False):
         Determine dataset's position in queue
 
         If the dataset is already finished, the position is -1. Else, the
-        position is the amount of datasets to be completed before this one will
+        position is the number of datasets to be completed before this one will
         be processed. A position of 0 would mean that the dataset is currently
         being executed, or that the backend is not running.
@@ -1610,7 +1610,7 @@ def get_result_url(self):
         """
         filename = self.get_results_path().name
         url_to_file = ('https://' if config.get("flask.https") else 'http://') + \
-                        config.get("flask.server_name") + '/result/' + filename
+                      config.get("flask.server_name") + '/result/' + filename
         return url_to_file
 
     def warn_unmappable_item(self, item_count, processor=None, error_message=None, warn_admins=True):
@@ -1637,6 +1637,276 @@ def warn_unmappable_item(self, item_count, processor=None, error_message=None, w
             # No other log available
             raise DataSetException(f"Unable to map item {item_count} for dataset {closest_dataset.key} and properly warn")
 
+    # Annotation functions (most of the logic is handled in the Annotation class)
+    def has_annotations(self) -> bool:
+        """
+        Whether this dataset has annotations
+        """
+
+        annotation = self.db.fetchone("SELECT * FROM annotations WHERE dataset = %s;", (self.key,))
+
+        return bool(annotation)
+
+    def get_annotations(self, item_id=None) -> list:
+        """
+        Retrieves the annotations for this dataset.
+
+        :param item_id:  A list of item IDs to get the annotations for.
+                         May also be a single string or int for a specific item.
+                         If left empty, get all the annotations for this dataset.
+
+        :return list: List of Annotation objects.
+        """
+
+        annotations = []
+
+        # Get annotation IDs first
+        if item_id:
+            # Wrap a single ID in a list
+            if isinstance(item_id, (str, int)):
+                item_id = [item_id]
+            item_id = [str(i) for i in item_id]
+            ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s AND item_id IN %s;",
+                                   (self.key, tuple(item_id)))
+        else:
+            # Else just get all the annotation data for this dataset
+            ids = self.db.fetchall("SELECT id FROM annotations WHERE dataset = %s;", (self.key,))
+
+        if not ids:
+            return []
+
+        # Then get the annotations by ID
+        ids = [i["id"] for i in ids]
+        for annotation_id in ids:
+            annotations.append(Annotation(annotation_id=annotation_id, db=self.db))
+
+        return annotations
+
+    def has_annotation_fields(self) -> bool:
+        """
+        Returns True if there are annotation fields saved to the datasets table
+        """
+
+        annotation_fields = self.get_annotation_fields()
+
+        return bool(annotation_fields)
+
+    def get_annotation_fields(self) -> dict:
+        """
+        Retrieves the saved annotation fields for this dataset.
+        These are stored in the `annotation_fields` column of the datasets table.
+
+        :return dict: The saved annotation fields.
+        """
+
+        annotation_fields = self.db.fetchone("SELECT annotation_fields FROM datasets WHERE key = %s;", (self.key,))
+
+        if annotation_fields and annotation_fields.get("annotation_fields"):
+            annotation_fields = json.loads(annotation_fields["annotation_fields"])
+        else:
+            annotation_fields = {}
+
+        return annotation_fields
+
+    def get_annotation_field_labels(self) -> list:
+        """
+        Retrieves the saved annotation field labels for this dataset.
+        These are stored in the `annotation_fields` column of the datasets table.
+
+        :return list: List of annotation field labels.
+        """
+
+        annotation_fields = self.get_annotation_fields()
+
+        if not annotation_fields:
+            return []
+
+        labels = [v["label"] for v in annotation_fields.values()]
+
+        return labels
+
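A short sketch of the new read-side API; the dataset object and item ID are assumed:

```python
if dataset.has_annotation_fields():
    print(dataset.get_annotation_field_labels())  # e.g. ["Valid", "Sentiment"]
    for annotation in dataset.get_annotations(item_id="12345"):
        print(annotation.label, annotation.value)
```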
[{"item_id": "12345", "label": "Valid", "value": "Yes"}] + :param bool overwrite: Whether to overwrite the annotation if it already present. + + :returns int: How many annotations were saved. + + """ + + if not annotations: + return 0 + + count = 0 + annotation_fields = self.get_annotation_fields() + annotation_labels = self.get_annotation_field_labels() + + field_id = "" + salt = str(random.randrange(0, 1000000)) + + # Add some dataset data to annotations, if not present + for annotation_data in annotations: + + # Check if the required fields are present + if "item_id" not in annotation_data: + raise AnnotationException("Can't save annotations; annotation must have an `item_id` referencing " + "the item it annotated, got %s" % annotation_data) + if "label" not in annotation_data or not isinstance(annotation_data["label"], str): + raise AnnotationException("Can't save annotations; annotation must have a `label` field, " + "got %s" % annotation_data) + if not overwrite and annotation_data["label"] in annotation_labels: + raise AnnotationException("Can't save annotations; annotation field with label %s " + "already exists" % annotation_data["label"]) + + # Set dataset key + if not annotation_data.get("dataset"): + annotation_data["dataset"] = self.key + + # Set default author to this dataset owner + # If this annotation is made by a processor, it will have the processor name + if not annotation_data.get("author"): + annotation_data["author"] = self.get_owners()[0] + + # The field ID can already exists for the same dataset/key combo, + # if a previous label has been renamed. + # If we're not overwriting, create a new key with some salt. + if not overwrite: + if not field_id: + field_id = hash_to_md5(annotation_data["dataset"] + annotation_data["label"] + salt) + if field_id in annotation_fields: + annotation_data["field_id"] = field_id + + # Create Annotation object, which also saves it to the database + # If this dataset/item ID/label combination already exists, this retrieves the + # existing data and updates it with new values. + annotation = Annotation(data=annotation_data, db=self.db) + + # Add data on the type of annotation field, if it is not saved to the datasets table yet. + # For now this is just a simple dict with a field ID, type, label, and possible options. + if not annotation_fields or annotation.field_id not in annotation_fields: + annotation_fields[annotation.field_id] = { + "label": annotation.label, + "type": annotation.type # Defaults to `text` + } + if annotation.options: + annotation_fields[annotation.options] = annotation.options + + count += 1 + + # Save annotation fields if things changed + if annotation_fields != self.get_annotation_fields(): + self.save_annotation_fields(annotation_fields) + + return count + + def delete_annotations(self, id=None, field_id=None): + """ + Deletes all annotations for an entire dataset. + If `id` or `field_id` are also given, it only deletes those annotations for this dataset. + + :param li id: A list or string of unique annotation IDs. + :param li field_id: A list or string of IDs for annotation fields. + + :return int: The number of removed records. + """ + + where = {"dataset": self.key} + + if id: + where["id"] = id + if field_id: + where["field_id"] = field_id + + return self.db.delete("annotations", where) + + def save_annotation_fields(self, new_fields: dict, add=False) -> int: + """ + Save annotation field data to the datasets table (in the `annotation_fields` column). 
+    def delete_annotations(self, id=None, field_id=None):
+        """
+        Deletes all annotations for an entire dataset.
+        If `id` or `field_id` are also given, only those annotations
+        are deleted for this dataset.
+
+        :param li id:        A list of unique annotation IDs, or a single ID.
+        :param li field_id:  A list of IDs for annotation fields, or a single ID.
+
+        :return int: The number of removed records.
+        """
+
+        where = {"dataset": self.key}
+
+        if id:
+            where["id"] = id
+        if field_id:
+            where["field_id"] = field_id
+
+        return self.db.delete("annotations", where)
+
+    def save_annotation_fields(self, new_fields: dict, add=False) -> int:
+        """
+        Save annotation field data to the datasets table (in the `annotation_fields` column).
+        If changes to the annotation fields affect existing annotations,
+        this function will also call `update_annotations_via_fields()` to change them.
+
+        :param dict new_fields:  New annotation fields, with a field ID as key.
+        :param bool add:         Whether we're merely adding new fields or replacing the
+                                 whole batch. If add is False, `new_fields` should contain
+                                 all fields.
+
+        :return int: The number of annotation fields saved.
+        """
+
+        # Get existing annotation fields to see if stuff changed.
+        old_fields = self.get_annotation_fields()
+        changes = False
+
+        # Do some validation.
+        # Annotation fields must be valid JSON.
+        try:
+            json.dumps(new_fields)
+        except (TypeError, ValueError):
+            raise AnnotationException("Can't save annotation fields: not valid JSON (%s)" % new_fields)
+
+        # Annotation fields must at minimum have `type` and `label` keys;
+        # field IDs are dict keys and thus unique by construction.
+        seen_labels = []
+        for field_id, annotation_field in new_fields.items():
+            if not isinstance(field_id, str):
+                raise AnnotationException("Can't save annotation fields: field ID %s is not a valid string" % field_id)
+            if "label" not in annotation_field:
+                raise AnnotationException("Can't save annotation fields: field %s must have a label" % field_id)
+            if "type" not in annotation_field:
+                raise AnnotationException("Can't save annotation fields: field %s must have a type" % field_id)
+            if annotation_field["label"] in seen_labels:
+                raise AnnotationException("Can't save annotation fields: labels must be unique (%s)" % annotation_field["label"])
+            seen_labels.append(annotation_field["label"])
+
+            # Keep track of whether existing fields have changed; if so, we're going to
+            # update the annotations table.
+            if field_id in old_fields and old_fields[field_id] != annotation_field:
+                changes = True
+
+        # Check if fields are removed
+        if not add:
+            for field_id in old_fields.keys():
+                if field_id not in new_fields:
+                    changes = True
+
+        # If we're just adding fields, add them to the old fields.
+        # If a field already exists, overwrite the old field.
+        if add and old_fields:
+            all_fields = old_fields
+            for field_id, annotation_field in new_fields.items():
+                all_fields[field_id] = annotation_field
+            new_fields = all_fields
+
+        # We're saving the new annotation fields as-is.
+        # Ordering of fields is preserved this way.
+        self.db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), self.key))
+
+        # If anything changed with the annotation fields, possibly update
+        # existing annotations (e.g. to delete them or change their labels).
+        if changes:
+            Annotation.update_annotations_via_fields(self.key, old_fields, new_fields, self.db)
+
+        return len(new_fields)
+
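And a sketch of adding a field definition directly; the field ID would normally be an md5 hash of dataset key plus label, shortened here for legibility:

```python
dataset.save_annotation_fields({
    "0a1b2c3d": {
        "label": "Sentiment",
        "type": "dropdown",
        "options": {"opt-1": "Positive", "opt-2": "Negative"}  # option id -> label
    }
}, add=True)
```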
+    def get_annotation_metadata(self) -> dict:
+        """
+        Retrieves all annotation data for this dataset from the annotations table.
+        """
+
+        annotation_data = self.db.fetchall("SELECT * FROM annotations WHERE dataset = %s;", (self.key,))
+        return annotation_data
+
     def __getattr__(self, attr):
         """
         Getter so we don't have to use .data all the time
diff --git a/common/lib/exceptions.py b/common/lib/exceptions.py
index 01bd9813f..f187b4258 100644
--- a/common/lib/exceptions.py
+++ b/common/lib/exceptions.py
@@ -44,6 +44,11 @@ class ProcessorException(FourcatException):
     """
     pass
 
+class AnnotationException(FourcatException):
+    """
+    Raise for exceptions when setting/getting annotations.
+    """
+    pass
 
 class MapItemException(ProcessorException):
     """
diff --git a/common/lib/helpers.py b/common/lib/helpers.py
index cd26d575f..1dcd3b27b 100644
--- a/common/lib/helpers.py
+++ b/common/lib/helpers.py
@@ -325,6 +326,22 @@ def convert_to_int(value, default=0):
     except (ValueError, TypeError):
         return default
 
+def convert_to_float(value, default=0) -> float:
+    """
+    Convert a value to a floating point, with a fallback
+
+    The fallback is used if an error is thrown during conversion to float.
+    This is a convenience function, but beats putting try-catches everywhere
+    we're using user input as a floating point number.
+
+    :param value:    Value to convert
+    :param default:  Default value, if conversion is not possible
+    :return float:   Converted value
+    """
+    try:
+        return float(value)
+    except (ValueError, TypeError):
+        return default
 
 def timify_long(number):
     """
@@ -956,7 +973,7 @@ def flatten_dict(d: MutableMapping, parent_key: str = '', sep: str = '.'):
     Lists will be converted to json strings via json.dumps()
 
     :param MutableMapping d:  Dictionary like object
-    :param str partent_key:  The original parent key prepending future nested keys
+    :param str parent_key:  The original parent key prepending future nested keys
     :param str sep:  A seperator string used to combine parent and child keys
     :return dict:  A new dictionary with the no nested values
     """
@@ -1040,3 +1057,9 @@ def folder_size(path='.'):
         elif entry.is_dir():
             total += folder_size(entry.path)
     return total
+
+def hash_to_md5(string: str) -> str:
+    """
+    Hash a string with an md5 hash.
+    """
+    return hashlib.md5(string.encode("utf-8")).hexdigest()
\ No newline at end of file
diff --git a/common/lib/user_input.py b/common/lib/user_input.py
index 63999083a..a2b3572a0 100644
--- a/common/lib/user_input.py
+++ b/common/lib/user_input.py
@@ -35,6 +35,7 @@ class UserInput:
     OPTION_FILE = "file"  # file upload
     OPTION_HUE = "hue"  # colour hue
     OPTION_DATASOURCES = "datasources"  # data source toggling
+    OPTION_DATASOURCES_TABLE = "datasources_table"  # a table with settings per data source
 
     OPTIONS_COSMETIC = (OPTION_INFO, OPTION_DIVIDER)
@@ -143,6 +144,21 @@ def parse_all(options, input, silently_correct=True):
             parsed_input[option] = [datasource for datasource, v in datasources.items() if v["enabled"]]
             parsed_input[option.split(".")[0] + ".expiration"] = datasources
 
+        elif settings.get("type") == UserInput.OPTION_DATASOURCES_TABLE:
+            # special case, parse table values to generate a dict
+            columns = list(settings["columns"].keys())
+            table_input = {}
+
+            for datasource in list(settings["default"].keys()):
+                table_input[datasource] = {}
+                for column in columns:
+                    choice = input.get(option + "-" + datasource + "-" + column, False)
+                    column_settings = settings["columns"][column]  # sub-settings per column
+                    table_input[datasource][column] = UserInput.parse_value(column_settings, choice, table_input, silently_correct=True)
+
+            parsed_input[option] = table_input
+
         elif option not in input:
             # not provided? 
use default parsed_input[option] = settings.get("default", None) diff --git a/datasources/dmi-tcat/explorer/dmi-tcat-explorer.json b/datasources/dmi-tcat/explorer/dmi-tcat-explorer.json deleted file mode 100644 index 017323e88..000000000 --- a/datasources/dmi-tcat/explorer/dmi-tcat-explorer.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "ndjson": { - "profile_picture": "", - "author": "{{ author_user.name }}", - "created": "{{ created_at }}", - "likes": "{{ public_metrics.like_count }} ", - "retweets": "{{ public_metrics.retweet_count }} ", - "replies": "{{ public_metrics.reply_count }} ", - "external_url": "https://twitter.com/{{ author_user.username }}/status/{{ id }}", - "image": "{{ attachments.media_keys.url }}", - "body": "{{ text }}", - "sort_options": [ - { - "key": "created_at", - "label": "Old to new" - }, - { - "key": "created_at", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true - }, - { - "key": "public_metrics.like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - }, - "csv": { - "likes": "{{ like_count }} ", - "retweets": "{{ retweet_count }} ", - "replies": "{{ reply_count }} ", - "external_url": "https://twitter.com/{{ author }}/status/{{ id }}", - "images": "{{ images }}", - "body": "{{ body }}", - "sort_options": [ - { - "key": "unix_timestamp", - "label": "Old to new", - "force_int": true - }, - { - "key": "unix_timestamp", - "label": "New to old", - "descending": true, - "force_int": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true, - "force_int": true - }, - { - "key": "like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css b/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css deleted file mode 100644 index 86bf76e27..000000000 --- a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.css +++ /dev/null @@ -1,84 +0,0 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
- - */ - -body { - background-color: white; -} - -.posts .post { - position: relative; - background-color: white; - max-width: 620px; - border: 1px solid #d6d6d6; - border-radius: 10px; - min-height: 50px; -} - -.posts .post header { - display: inline-block; - line-height: 1.7em; - margin-bottom: 5px; - border: none; - color: rgb(104, 119, 130); -} - -.posts .post header .post_id { - display: none; -} - -.posts .post header .author { - color: black; -} - -.posts .post header .profile_picture { - float: left; - margin-right: 15px; -} - -.posts .post header .profile_picture img { - border-radius: 100px; - width: 50px; -} - -.posts .post header .profile_picture:after { - display: none; -} - -.posts .post article { - margin: 0; - padding: 0; -} - -.posts .post.op { - background-color: white; - color: black; -} - -.posts .post .post-content { - display: inline-block; -} - -.posts .post .post-image { - margin-bottom: 10px; -} - -.posts .post .post-image img { - border-radius: 10px; -} - -.posts .external-url { - color: rgb(104, 119, 130); -} - -.posts .post.op .post-annotations, .posts .post .post-annotations { - border-radius: 10px; - background-color: rgb(241, 249, 255); - color: #474747; -} - -span.hashtag { - color: rgb(29, 155, 240); -} \ No newline at end of file diff --git a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.json b/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.json deleted file mode 100644 index 017323e88..000000000 --- a/datasources/dmi-tcatv2/explorer/dmi-tcat-explorer.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "ndjson": { - "profile_picture": "", - "author": "{{ author_user.name }}", - "created": "{{ created_at }}", - "likes": "{{ public_metrics.like_count }} ", - "retweets": "{{ public_metrics.retweet_count }} ", - "replies": "{{ public_metrics.reply_count }} ", - "external_url": "https://twitter.com/{{ author_user.username }}/status/{{ id }}", - "image": "{{ attachments.media_keys.url }}", - "body": "{{ text }}", - "sort_options": [ - { - "key": "created_at", - "label": "Old to new" - }, - { - "key": "created_at", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true - }, - { - "key": "public_metrics.like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - }, - "csv": { - "likes": "{{ like_count }} ", - "retweets": "{{ retweet_count }} ", - "replies": "{{ reply_count }} ", - "external_url": "https://twitter.com/{{ author }}/status/{{ id }}", - "images": "{{ images }}", - "body": "{{ body }}", - "sort_options": [ - { - "key": "unix_timestamp", - "label": "Old to new", - "force_int": true - }, - { - "key": "unix_timestamp", - "label": "New to old", - "descending": true, - "force_int": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true, - "force_int": true - }, - { - "key": "like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/douyin/explorer/douyin-explorer.json b/datasources/douyin/explorer/douyin-explorer.json deleted file mode 
100644 index 3735aa0bf..000000000 --- a/datasources/douyin/explorer/douyin-explorer.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "ndjson": { - "author": "{{ author.nickname }}", - "created": "{{ create_time | datetime }}", - "body": "
{{ desc }}", - "external_url": "{{ share_url }}", - "plays": " {{ statistics.play_count | numberify }}", - "likes": " {{ statistics.digg_count | numberify }}", - "comments": " {{ statistics.comment_count | numberify }}", - "shares": " {{ statistics.share_count | numberify }}", - - "sort_options": [ - { - "key": "create_time", - "label": "Old to new" - }, - { - "key": "create_time", - "label": "New to old", - "descending": true - }, - { - "key": "statistics.play_count", - "label": "Plays", - "descending": true - }, - { - "key": "statistics.digg_count", - "label": "Likes", - "descending": true - }, - { - "key": "statistics.comment_count", - "label": "Comments", - "descending": true - }, - { - "key": "statistics.share_count", - "label": "Shares", - "descending": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 12768196c..4535eb47f 100644 --- a/datasources/douyin/search_douyin.py +++ b/datasources/douyin/search_douyin.py @@ -7,6 +7,7 @@ from datetime import datetime from backend.lib.search import Search +from common.lib.helpers import UserInput from common.lib.item_mapping import MappedItem, MissingMappedField class SearchDouyin(Search): @@ -26,7 +27,7 @@ class SearchDouyin(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - + def get_items(self, query): """ Run custom search diff --git a/datasources/fourchan/explorer/fourchan-explorer.json b/datasources/fourchan/explorer/fourchan-explorer.json deleted file mode 100644 index 9ba81e454..000000000 --- a/datasources/fourchan/explorer/fourchan-explorer.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "subject": "{{ subject }}", - "board": "/{{ board }}/", - "image": "{{ image_4chan | 4chan_image(id, board, image_md5) }}", - "country": "", - "deleted": "", - "external_url": "https://archive.4plebs.org/{{ board }}/thread/{{ thread_id }}#{{ id }}", - "sort_options": [ - { - "key": "timestamp", - "label": "Old to new" - }, - { - "key": "timestamp", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true - }, - { - "key": "thread_id", - "label": "Thread id", - "force_int": true - }, - { - "key": "country_code", - "label": "Country" - } - ] -} \ No newline at end of file diff --git a/datasources/fourchan/search_4chan.py b/datasources/fourchan/search_4chan.py index 17694badc..7b69b872e 100644 --- a/datasources/fourchan/search_4chan.py +++ b/datasources/fourchan/search_4chan.py @@ -442,7 +442,7 @@ class Search4Chan(SearchWithScope): "help": "Can query without keyword", "default": False, "tooltip": "Allows users to query the 4chan data without specifying a keyword. This can lead to HUGE datasets!" 
- }, + } } def get_items_simple(self, query): diff --git a/datasources/imgur/search_imgur.py b/datasources/imgur/search_imgur.py index b8c80ec5b..198f22491 100644 --- a/datasources/imgur/search_imgur.py +++ b/datasources/imgur/search_imgur.py @@ -8,8 +8,9 @@ from backend.lib.search import Search from common.lib.item_mapping import MappedItem +from common.lib.helpers import UserInput -class SearchNineGag(Search): +class SearchImgur(Search): """ Import scraped Imgur data """ diff --git a/datasources/instagram/explorer/instagram-explorer.css b/datasources/instagram/explorer/instagram-explorer.css deleted file mode 100644 index 63bc05fb7..000000000 --- a/datasources/instagram/explorer/instagram-explorer.css +++ /dev/null @@ -1,34 +0,0 @@ -* { - color: black; -} - -h1 span { - color: white; -} - -body { - background-color: white; -} - -.posts li.post { - max-width: 225px; - background-color: white; - font-family: "Segoe UI", Roboto, Helvetica, Arial, sans-serif; - font-size: 14px; - border-bottom: 1px solid grey; -} - -.posts header { - border: none; -} - -.posts .alt, .posts .alt time { - color: grey; -} - -.posts .post-image { - max-width: 200px; - margin: 0 auto; - margin-top: 30px; - margin-bottom: 30px; -} \ No newline at end of file diff --git a/datasources/instagram/explorer/instagram-explorer.json b/datasources/instagram/explorer/instagram-explorer.json deleted file mode 100644 index 9e5935297..000000000 --- a/datasources/instagram/explorer/instagram-explorer.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "ndjson": { - "author": "{{ user.full_name }}", - "body": "{{ caption.text }}", - "image": "retrieve:{{ image_versions2.candidates.url }}", - "likes": "{{ like_count }} likes", - "comments": "{{ comment_count }} comments", - "date": "{{ taken_at | datetime }}", - "external_url": "https://instagram.com/p/{{ code }}", - "type": "{{ product_type }}", - "sort_options": [ - { - "key": "taken_at", - "label": "Old to new" - }, - { - "key": "taken_at", - "label": "New to old", - "descending": true - }, - { - "key": "like_count", - "label": "Likes", - "descending": true - }, - { - "key": "stats.commentCount", - "label": "Comments", - "descending": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/instagram/search_instagram.py b/datasources/instagram/search_instagram.py index 3a3b76f4c..d9749c29c 100644 --- a/datasources/instagram/search_instagram.py +++ b/datasources/instagram/search_instagram.py @@ -113,10 +113,11 @@ def parse_graph_item(node): media_types = set([s["node"]["__typename"] for s in node["edge_sidecar_to_children"]["edges"]]) media_type = "mixed" if len(media_types) > 1 else type_map.get(media_types.pop(), "unknown") - location = {"name": MissingMappedField(""), "latlong": MissingMappedField(""), "city": MissingMappedField("")} + location = {"name": MissingMappedField(""), "location_id": MissingMappedField(""), "latlong": MissingMappedField(""), "city": MissingMappedField("")} # location has 'id', 'has_public_page', 'name', and 'slug' keys in tested examples; no lat long or "city" though name seems if node.get("location"): location["name"] = node["location"].get("name") + location["location_id"] = node["location"].get("pk") # Leaving this though it does not appear to be used in this type; maybe we'll be surprised in the future... 
location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[ "location"].get("lat") else "" @@ -137,6 +138,7 @@ def parse_graph_item(node): "timestamp": datetime.datetime.fromtimestamp(node["taken_at_timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), "author": user.get("username", owner.get("username", MissingMappedField(""))), "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))), + "is_verified": True if user.get("is_verified") else False, "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))), "type": media_type, "url": "https://www.instagram.com/p/" + node["shortcode"], @@ -149,6 +151,7 @@ def parse_graph_item(node): "num_comments": node.get("edge_media_preview_comment", {}).get("count", 0), "num_media": num_media, "location_name": location["name"], + "location_id": location["location_id"], "location_latlong": location["latlong"], "location_city": location["city"], "unix_timestamp": node["taken_at_timestamp"] @@ -205,9 +208,10 @@ def parse_itemlist_item(node): else: num_comments = -1 - location = {"name": MissingMappedField(""), "latlong": MissingMappedField(""), "city": MissingMappedField("")} + location = {"name": MissingMappedField(""), "location_id": MissingMappedField(""), "latlong": MissingMappedField(""), "city": MissingMappedField("")} if node.get("location"): location["name"] = node["location"].get("name") + location["location_id"] = node["location"].get("pk") location["latlong"] = str(node["location"]["lat"]) + "," + str(node["location"]["lng"]) if node[ "location"].get("lat") else "" location["city"] = node["location"].get("city") @@ -218,6 +222,14 @@ def parse_itemlist_item(node): if user.get("username") != owner.get("username"): raise MapItemException("Unable to parse item: different user and owner") + # Instagram posts also allow 'Collabs' with up to one co-author + coauthor = {"coauthor": "", "coauthor_fullname": "", "coauthor_id": ""} + if node.get("coauthor_producers"): + coauthor_node = node["coauthor_producers"][0] + coauthor["coauthor"] = coauthor_node.get("username") + coauthor["coauthor_fullname"] = coauthor_node.get("full_name") + coauthor["coauthor_id"] = coauthor_node.get("id") + mapped_item = { "id": node["code"], "post_source_domain": node.get("__import_meta", {}).get("source_platform_url"), # Zeeschuimer metadata @@ -226,7 +238,11 @@ def parse_itemlist_item(node): "body": caption, "author": user.get("username", owner.get("username", MissingMappedField(""))), "author_fullname": user.get("full_name", owner.get("full_name", MissingMappedField(""))), + "verified": True if user.get("is_verified") else False, "author_avatar_url": user.get("profile_pic_url", owner.get("profile_pic_url", MissingMappedField(""))), + "coauthor": coauthor["coauthor"], + "coauthor_fullname": coauthor["coauthor_fullname"], + "coauthor_id": coauthor["coauthor_id"], "timestamp": datetime.datetime.fromtimestamp(node["taken_at"]).strftime("%Y-%m-%d %H:%M:%S"), "type": media_type, "url": "https://www.instagram.com/p/" + node["code"], @@ -239,6 +255,7 @@ def parse_itemlist_item(node): "num_comments": num_comments, "num_media": num_media, "location_name": location["name"], + "location_id": location["location_id"], "location_latlong": location["latlong"], "location_city": location["city"], "unix_timestamp": node["taken_at"] diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index a8380b4d8..dca1ce127 100644 --- 
a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -84,6 +84,18 @@ def map_item(item): elif image and image.get("artifacts"): images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) + # video thumbnails are stored similarly to image data + video_thumb_url = "" + thumb_content = None + if item["content"] and "*videoPlayMetadata" in item["content"]: + thumb_content = item["content"]["*videoPlayMetadata"]["thumbnail"] + elif item["content"] and "linkedInVideoComponent" in item["content"] and item["content"]["linkedInVideoComponent"]: + thumb_content = item["content"]["linkedInVideoComponent"]["*videoPlayMetadata"]["thumbnail"] + elif item["content"] and "externalVideoComponent" in item["content"] and item["content"]["externalVideoComponent"]: + thumb_content = item["content"]["externalVideoComponent"]["*videoPlayMetadata"]["thumbnail"] + if thumb_content: + video_thumb_url = thumb_content["rootUrl"] + thumb_content["artifacts"][0]["fileIdentifyingUrlPathSegment"] + author = SearchLinkedIn.get_author(item) # the ID is in the format 'urn:li:activity:6960882777168695296' @@ -103,18 +115,43 @@ def map_item(item): elif item["commentary"] and "attributesV2" in item["commentary"]["text"]: hashtags = [tag["detailData"]["*hashtag"]["trackingUrn"].split(":").pop() for tag in item["commentary"]["text"].get("attributesV2", []) if "*hashtag" in tag["detailData"]] + # and mentions + # we're storing both usernames and full names + author_mentions = [] + author_name_mentions = [] + if item["commentary"] and "attributes" in item["commentary"]["text"]: + for mention in item["commentary"]["text"].get("attributes", {}): + if mention["type"] == "PROFILE_MENTION": + mention = mention["*miniProfile"] + author_mentions.append(mention["publicIdentifier"]) + author_name_mentions.append(" ".join([mention.get("firstName", ""), mention.get("lastName", "")])) + elif mention["type"] == "COMPANY_NAME": + mention = mention["*miniCompany"] + author_mentions.append(mention["universalName"]) + author_name_mentions.append(mention.get("name", "")) + # same for metrics if "*totalSocialActivityCounts" in item["*socialDetail"]: metrics = { - "likes": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"], "comments": item["*socialDetail"]["*totalSocialActivityCounts"]["numComments"], - "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"] - } + "shares": item["*socialDetail"]["*totalSocialActivityCounts"]["numShares"], + "reactions": item["*socialDetail"]["*totalSocialActivityCounts"]["numLikes"], + "reaction_like": 0, + "reaction_empathy": 0, + "reaction_praise": 0, + "reaction_entertainment": 0, + "reaction_appreciation": 0, + "reaction_interest": 0 + } + # There are different kinds of reaction metrics + for reaction_type in item["*socialDetail"]["*totalSocialActivityCounts"].get("reactionTypeCounts", []): + metrics["reaction_" + reaction_type["reactionType"].lower()] = reaction_type["count"] + else: metrics = { - "likes": item["*socialDetail"]["likes"]["paging"]["total"], "comments": item["*socialDetail"]["comments"]["paging"]["total"], "shares": item["*socialDetail"]["totalShares"], + "reactions": item["*socialDetail"]["likes"]["paging"]["total"] } # and links @@ -133,8 +170,11 @@ def map_item(item): "timestamp_ago": time_ago.split("•")[0].strip(), "is_promoted": "yes" if not re.findall(r"[0-9]", time_ago) else "no", **{("author_" + k).replace("_username", ""): v for k, v in author.items()}, + "author_mentions":
",".join(author_mentions), + "author_name_mentions": ",".join(author_name_mentions), "hashtags": ",".join(hashtags), "image_urls": ",".join(images), + "video_thumb_url": video_thumb_url, "post_url": "https://www.linkedin.com/feed/update/" + urn, "link_url": link_url, **metrics, diff --git a/datasources/reddit/explorer/reddit-explorer.json b/datasources/reddit/explorer/reddit-explorer.json deleted file mode 100644 index b70c85065..000000000 --- a/datasources/reddit/explorer/reddit-explorer.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "post_flair": "{{ post_flair }}", - "author": "{{ author }} {{author_flair}}", - "subreddit": "r/{{subreddit}}", - "score": " {{score}} ", - "external_url": "https://reddit.com/r/{{subreddit}}/comments/{{thread_id}}/comment/{{id}}", - "image": "{{ image_file }}", - "subject": "{{ subject }}", - "subject_url": "{{ domain }}", - "sort_options": [ - { - "key": "timestamp", - "label": "Old to new" - }, - { - "key": "timestamp", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id" - }, - { - "key": "thread_id", - "label": "Thread id" - }, - { - "key": "score", - "label": "Score (highest to lowest)", - "descending": true, - "force_int": true - }, - { - "key": "score", - "label": "Score (lowest to highest)", - "force_int": true - } - ], - "markdown": true -} \ No newline at end of file diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 9e523a247..aa2f5bca1 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -820,6 +820,12 @@ def map_item(message): # Failsafe; can be updated to support formatting of new datastructures in the future reactions += f"{reaction}, " + is_reply = False + reply_to = "" + if message.get("reply_to"): + is_reply = True + reply_to = message["reply_to"].get("reply_to_msg_id", "") + # t.me links linked_entities = set() all_links = ural.urls_from_text(message["message"]) @@ -861,7 +867,9 @@ def map_item(message): "author_name": fullname, "author_is_bot": "yes" if user_is_bot else "no", "body": message["message"], - "reply_to": message.get("reply_to_msg_id", ""), + "body_markdown": message["message_markdown"], + "is_reply": is_reply, + "reply_to": reply_to, "views": message["views"] if message["views"] else "", # "forwards": message.get("forwards", MissingMappedField(0)), "reactions": reactions, @@ -956,6 +964,11 @@ def serialize_obj(input_obj): # Add the _type if the original object was a telethon type if type(input_obj).__module__ in ("telethon.tl.types", "telethon.tl.custom.forward"): mapped_obj["_type"] = type(input_obj).__name__ + + # Store the markdown-formatted text + if type(input_obj).__name__ == "Message": + mapped_obj["message_markdown"] = input_obj.text + return mapped_obj @staticmethod diff --git a/datasources/tiktok/explorer/tiktok-explorer.css b/datasources/tiktok/explorer/tiktok-explorer.css deleted file mode 100644 index e295a248f..000000000 --- a/datasources/tiktok/explorer/tiktok-explorer.css +++ /dev/null @@ -1,23 +0,0 @@ -body { - color: black; -} - -.posts header span.post_id { - display: none; -} - -.posts li.post { - border-radius: 10px; - padding: 20px; - max-width: 200px; - background-color: white; - border-bottom: 1px grey; -} - - -.posts li.post .preview { - display: block; - margin: 0 auto; - max-width: 300px; -} - diff --git a/datasources/tiktok/explorer/tiktok-explorer.json b/datasources/tiktok/explorer/tiktok-explorer.json deleted file mode 100644 index 92724261f..000000000 --- 
a/datasources/tiktok/explorer/tiktok-explorer.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "ndjson": { - "tiktok-user": "@{{ author.uniqueId }}", - "external_url": "https://www.tiktok.com/@{{ author.uniqueId }}/video/{{ id }}", - "body": "
{{ desc }}", - "author": "{{ nickname }}", - "created": "{{ createTime | datetime }}", - "plays": " {{ stats.playCount | numberify }}", - "likes": " {{ stats.diggCount | numberify }}", - "comments": " {{ stats.commentCount | numberify }}", - "shares": " {{ stats.shareCount | numberify }}", - - "sort_options": [ - { - "key": "createTime", - "label": "Old to new" - }, - { - "key": "createTime", - "label": "New to old", - "descending": true - }, - { - "key": "stats.playCount", - "label": "Plays", - "descending": true - }, - { - "key": "stats.diggCount", - "label": "Likes", - "descending": true - }, - { - "key": "stats.commentCount", - "label": "Comments", - "descending": true - }, - { - "key": "stats.shareCount", - "label": "Shares", - "descending": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/tiktok_urls/explorer/tiktok_urls-explorer.json b/datasources/tiktok_urls/explorer/tiktok_urls-explorer.json deleted file mode 100644 index f49132c04..000000000 --- a/datasources/tiktok_urls/explorer/tiktok_urls-explorer.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "ndjson": { - "post_id": "", - "external_url": "https://www.tiktok.com/@{{ author }}/video/{{ id }}", - "body": "
{{ desc }}", - "author": "{{ nickname }}", - "musicname": "{{ 'music_name': post['music']['title'] }}", - "created": "{{ createTime | datetime }}", - "plays": " {{ stats.playCount | numberify }}", - "likes": " {{ stats.diggCount | numberify }}", - "comments": " {{ stats.commentCount | numberify }}", - "shares": " {{ stats.shareCount | numberify }}", - - "sort_options": [ - { - "key": "createTime", - "label": "Old to new" - }, - { - "key": "timestamp", - "label": "New to old", - "descending": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/truth/search_truth.py b/datasources/truth/search_truth.py index c1743e12c..b34703c4f 100644 --- a/datasources/truth/search_truth.py +++ b/datasources/truth/search_truth.py @@ -35,7 +35,7 @@ def map_item(post): """ Parse Truth Social post - :param node: Data as received from Truth Social + :param post: Data as received from Truth Social :return dict: Mapped item """ diff --git a/datasources/tumblr/DESCRIPTION.md b/datasources/tumblr/DESCRIPTION.md index a2be57d25..5100cb47f 100644 --- a/datasources/tumblr/DESCRIPTION.md +++ b/datasources/tumblr/DESCRIPTION.md @@ -1,5 +1,5 @@ The Tumblr data is retrieved by interfacing with the [Tumblr API](https://api.tumblr.com). -It is only possible to get posts by tag or per blog, since the API does not allow keyword search. +It is only possible to get posts by tag, per blog, or by individual posts, since the API does not allow keyword search. ### Privacy Be aware that the data may contain personal information. It is thus recommended to pseudonymise the data. @@ -7,14 +7,15 @@ Be aware that the data may contain personal information. It is thus recommended To comply with the Tumblr API requirements, Tumblr datasets are deleted after three days. ### Rate limits -4CAT uses an internal API key to get Tumblr posts. These are limited to the +If set, 4CAT uses an internal API key to get Tumblr posts. These are limited to the [following rate limits](https://www.tumblr.com/docs/en/api/v2#rate-limits). However, administrators may request a rate limit increase via Tumblr. +If no internal API key is set, you can insert your own. + ### Date bugs -The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used -tags, it may return zero posts, even though older posts *do* exist. Check the oldest post in -your dataset to see if it this is indeed the case and whether any odd time gaps exists. +The [Tumblr API](https://api.tumblr.com) is volatile: when fetching content, it may return zero posts, even though older posts *do* exist. Check the oldest post in +your dataset to see if this is indeed the case and whether any odd time gaps exist. 4CAT tries to mitigate this by decreasing the date parameter (before) with six hours and sending the query again. This often successfully returns older, un-fetched posts. If it didn't find new data after checking 24 days in the past, it checks for data up to six years diff --git a/datasources/tumblr/explorer/tumblr-explorer.css b/datasources/tumblr/explorer/tumblr-explorer.css deleted file mode 100644 index a7b3df88d..000000000 --- a/datasources/tumblr/explorer/tumblr-explorer.css +++ /dev/null @@ -1,74 +0,0 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
- - */ - -body { - background-color: #001935; -} - -#metadata, footer { - color: white; -} - -.content { - font-family: "Favorit", "Helvetica Neue", "HelveticaNeue", Helvetica, Arial, sans-serif; -} - -.posts li.post { - background-color: white; - color: black; - font-size: 14px; - left: 0; - border-radius: 3px; - max-width: 540px; - padding: 0; -} - -.posts li.post header { - display: inline-block; - text-decoration: none; - font-weight: bold; - border: none; - padding: 0px; - line-height: 1.7em; - margin: 25px; - margin-bottom: 10px; -} - -.posts li.post article { - padding: 0; - margin: 0; -} - -.posts li.post .post-content { - display: block; - margin: 25px; - margin-top: 0px; -} - -.posts li.post .post-tags { - color: #5e5e5e; - margin-top: 20px; - word-break: break-all; -} - -.posts li.post .author { - font-weight: bold; -} - -.posts li.post .post-image { - width: 100%; - margin-bottom: 15px; -} - -.posts li.post .external-url { - -} - -.posts li.post .post-annotations { - background-color: white; - border-top: 1px solid #5e5e5e; - margin-right: 0; -} \ No newline at end of file diff --git a/datasources/tumblr/explorer/tumblr-explorer.json b/datasources/tumblr/explorer/tumblr-explorer.json deleted file mode 100644 index d5ad04b51..000000000 --- a/datasources/tumblr/explorer/tumblr-explorer.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "external_url": "https://{{author}}.tumblr.com/{{id}}", - "notes": "{{notes}} notes", - "reblog": "{{ is_reblog }}", - "images": "{{ images }}", - "body": "{{ body }}
{{tags}}
", - "sort_options": [ - { - "key": "timestamp", - "label": "Old to new" - }, - { - "key": "timestamp", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id" - }, - { - "key": "notes", - "label": "Most notes", - "descending": true, - "force_int": true - } - ] -} \ No newline at end of file diff --git a/datasources/tumblr/search_tumblr.py b/datasources/tumblr/search_tumblr.py index 0ce4328dc..0b1b5bb78 100644 --- a/datasources/tumblr/search_tumblr.py +++ b/datasources/tumblr/search_tumblr.py @@ -1,24 +1,33 @@ """ Search Tumblr via its API -Can fetch posts from specific blogs or with specific hashtags +Can fetch posts from specific blogs or with specific tags + +For Tumblr API documentation, see https://www.tumblr.com/docs/en/api/v2 +For Neue Post Format documentation, see https://github.com/tumblr/docs/blob/master/npf-spec.md + """ import time import pytumblr +import requests +import re +import json from requests.exceptions import ConnectionError from datetime import datetime from common.config_manager import config from backend.lib.search import Search -from common.lib.helpers import UserInput +from common.lib.helpers import UserInput, strip_tags from common.lib.exceptions import QueryParametersException, ProcessorInterruptedException, ConfigException +from common.lib.item_mapping import MappedItem __author__ = "Sal Hagen" __credits__ = ["Sal Hagen", "Tumblr API (api.tumblr.com)"] __maintainer__ = "Sal Hagen" __email__ = "4cat@oilab.eu" + class SearchTumblr(Search): """ Tumblr data filter module. @@ -26,18 +35,21 @@ class SearchTumblr(Search): type = "tumblr-search" # job ID category = "Search" # category title = "Search Tumblr" # title displayed in UI - description = "Retrieve Tumblr posts by hashtag or blog." # description displayed in UI - extension = "csv" # extension of result file, used internally and in UI - is_local = False # Whether this datasource is locally scraped - is_static = False # Whether this datasource is still updated + description = "Retrieve Tumblr posts by tags or blogs." # description displayed in UI + extension = "ndjson" # extension of result file, used internally and in UI + is_local = False # Whether this datasource is locally scraped + is_static = False # Whether this datasource is still updated # not available as a processor for existing datasets accepts = [None] max_workers = 1 - max_retries = 3 # For API and connection retries. - max_date_retries = 96 + 150 # For checking dates. 96 time retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). + # For API and connection retries. + max_retries = 3 + # For checking dates. 96 retries of -6 hours (24 days), plus 150 extra for 150 weeks (~3 years). 
+ max_date_retries = 96 + 150 max_posts = 1000000 + max_reblogs = 1000 max_posts_reached = False api_limit_reached = False @@ -45,7 +57,7 @@ class SearchTumblr(Search): seen_ids = set() client = None failed_notes = [] - failed_reblogs = [] + failed_posts = [] config = { # Tumblr API keys to use for data capturing @@ -72,57 +84,79 @@ 'default': "", 'help': 'Tumblr API Secret Key', 'tooltip': "", - }, + } } references = ["[Tumblr API documentation](https://www.tumblr.com/docs/en/api/v2)"] @classmethod def get_options(cls, parent_dataset=None, user=None): """ - Check is Tumbler keys configured and if not, requests from User + Check if Tumblr keys are configured and, if not, request them from the user """ options = { "intro": { "type": UserInput.OPTION_INFO, "help": "Retrieve any kind of Tumblr posts with specific tags or from specific blogs. Gets 100,000 posts " - "at max. Insert tags or names of blogs, one on each line. You may insert up to ten tags or " - "blogs.\n\nTumblr tags may include whitespace and commas. A `#` before the tag is optional.\n\n" - "Tag search only get posts explicitly associated with the exact tag you insert here. Querying " - "`gogh` will thus not get posts only tagged with `van gogh`. Keyword search is unfortunately not " - "allowed by the [Tumblr API](https://api.tumblr.com).\n\nIf 4CAT reached its Tumblr API rate " - "limit, try again 24 hours later." + "at most. You may insert up to ten tags or blogs.\n\n" + "*Tag-level search only returns original posts*. " + "Reblogs of tagged posts can be retrieved via the options below. Blog-level search also returns reblogs.\n\n" + "Tag search only gets posts with the exact tag you insert. Querying " + "`gogh` will not get posts tagged with `van gogh`.\n\n" + "A `#` before a tag is optional. Blog names must start with `@`.\n\n" + "Individual posts can be captured by inserting their URL or via the format `@blogname:post_id`.\n\n" + "Keyword search is not allowed by the [Tumblr API](https://api.tumblr.com).\n\n" + "If this 4CAT instance has reached its Tumblr API rate limit, try again 24 hours later." }, - "search_scope": { + "query": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Tags, blogs, or post URLs.", + "tooltip": "Separate with comma or newline, e.g.: #research tools, @4catblog, https://tumblr.com/4catblog/123456789" + }, + "get_notes": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add note data (warning: slow)", + "tooltip": "Add note data for every post. This includes note metrics, " + "replies, reblogged text, and reblogged images. " + "Blog- and id-level search includes reblogged text by default. " + "Limited to the first 1,000 reblogs per post.", + "default": False + }, + "get_reblogs": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add reblogs", + "tooltip": "Add reblogs of initially captured posts as new posts to the dataset. ", + "requires": "get_notes==true", + "default": False + }, + "reblog_type": { "type": UserInput.OPTION_CHOICE, - "help": "Search by", + "help": "Reblogs to add", "options": { - "tag": "Tag", - "blog": "Blog" + "text": "Only with added text", + "text_or_tag": "Only with added text and/or added tags (slow)" }, - "default": "tag" + "tooltip": "What type of reblogs to add to the dataset.", + "requires": "get_reblogs==true", + "default": "text" }, - "query": { - "type": UserInput.OPTION_TEXT_LARGE, - "help": "Tags/blogs", - "tooltip": "Separate with commas or new lines."
- }, - "fetch_reblogs": { + "reblog_outside_daterange": { "type": UserInput.OPTION_TOGGLE, - "help": "Also fetch reblogs with text? (warning: slow)", + "help": "Retain reblogs outside of date range", + "requires": "get_reblogs==true", "default": False } } try: - config_keys = SearchTumblr.get_tumbler_keys(user) + SearchTumblr.get_tumblr_keys(user) except ConfigException: # No 4CAT set keys for user; let user input their own options["key-info"] = { "type": UserInput.OPTION_INFO, - "help": "In order to access the Tumblr API, you need to register an application. You can do so " - "[here](https://www.tumblr.com/oauth/apps) and use the keys below. You will first get the OAuth " + "help": "To access the Tumblr API, you need to register an application. You can do so " + "[here](https://www.tumblr.com/oauth/apps). You will first get the OAuth " "Consumer Key and Secret, and then the User Token Key and Secret [after entering them here](ht" - "tps://api.tumblr.com/console/calls/user/info) and granting access." + "tps://api.tumblr.com/console/calls/user/info) and granting access." } options["consumer_key"] = { "type": UserInput.OPTION_TEXT, @@ -150,22 +184,24 @@ def get_options(cls, parent_dataset=None, user=None): } options["divider"] = { - "type": UserInput.OPTION_DIVIDER - } + "type": UserInput.OPTION_DIVIDER + } options["date-intro"] = { - "type": UserInput.OPTION_INFO, - "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is volatile: when fetching sporadically used " - "tags, it may return zero posts, even though older posts exist. To mitigate this, 4CAT decreases " - "the date parameter (before) with six hours and sends the query again. This often " - "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " - "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " - "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " - "your dataset to see if it this is indeed the case and whether any odd time gaps exists." - } + "type": UserInput.OPTION_INFO, + "help": "**Note:** The [Tumblr API](https://api.tumblr.com) is very volatile. Queries may not return " + "posts, even if posts exist. Waiting for a while and querying again can help, even with identical queries. " + "Consider carrying out multiple queries and using the 'Merge datasets' processor to limit false negatives.\n\n" + "Additionally, older tagged posts may not be returned, even if they exist. To mitigate this, 4CAT decreases " + "the date parameter (before) by six hours and sends the query again. This often " + "successfully returns older, un-fetched posts. If it didn't find new data after 96 retries (24 " + "days), it checks for data up to six years before the last date, decreasing 12 times by 6 months. " + "If that also results in nothing, it assumes the dataset is complete. Check the oldest post in " + "your dataset to see if this is indeed the case and whether any odd time gaps exist."
+ } options["daterange"] = { - "type": UserInput.OPTION_DATERANGE, - "help": "Date range" - } + "type": UserInput.OPTION_DATERANGE, + "help": "Date range" + } return options @@ -177,119 +213,244 @@ def get_items(self, query): # ready our parameters parameters = self.dataset.get_parameters() - scope = parameters.get("search_scope", "") - queries = parameters.get("query").split(", ") - fetch_reblogs = parameters.get("fetch_reblogs", False) + queries = re.split(",|\n", parameters.get("query", "")) + get_notes = parameters.get("get_notes", False) + get_reblogs = parameters.get("get_reblogs", False) + reblog_type = parameters.get("reblog_type", False) + reblog_outside_daterange = parameters.get("reblog_outside_daterange", False) # Store all info here results = [] - # Store all notes from posts by blogs here - all_notes = [] + # Blog names and post IDs of extra posts we need to fetch + # (e.g. in the reblog trail or posts that reblog captured posts) + extra_posts = [] # Get date parameters min_date = parameters.get("min_date", None) max_date = parameters.get("max_date", None) + min_date = int(min_date) if min_date else 0 + max_date = int(max_date) if max_date else int(time.time()) - if min_date: - min_date = int(min_date) - if max_date: - max_date = int(max_date) - else: - max_date = int(time.time()) + if not queries: + self.dataset.finish_with_error("No queries given") + return # Connect to Tumblr API try: self.client = self.connect_to_tumblr() - except ConfigException as e: + except ConfigException: self.log.warning(f"Could not connect to Tumblr API: API keys invalid or not set") self.dataset.finish_with_error(f"Could not connect to Tumblr API: API keys invalid or not set") return except ConnectionRefusedError as e: client_info = self.client.info() self.log.warning(f"Could not connect to Tumblr API: {e}; client_info: {client_info}") - self.dataset.finish_with_error(f"Could not connect to Tumblr API: {client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") + self.dataset.finish_with_error( + f"Could not connect to Tumblr API:" + f"{client_info.get('meta', {}).get('status', '')} - {client_info.get('meta', {}).get('msg', '')}") return - # for each tag or blog, get post - for query in queries: + # For each tag or blog, get posts + # with a limit of ten individual tasks. + for query in queries[:10]: - # Get posts per tag - if scope == "tag": - new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date) + query = query.strip() - # Get posts per blog - elif scope == "blog": - new_results, notes = self.get_posts_by_blog(query, max_date=max_date, min_date=min_date) - all_notes.append(notes) - else: - self.dataset.update_status("Invalid scope") - break + post_id = None + + # Format @blogname:id + if query.startswith("@"): + + # Get a possible post ID + blog_name = query[1:] + if ":" in query: + blog_name, post_id = blog_name.split(":") + + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Post URL + elif "tumblr.com/" in query: + + try: + # Format https://{blogname}.tumblr.com/post/{post_id} + if "/post/" in query: + blog_name = query.split(".tumblr.com")[0].replace("https://", "").replace("www.", "").strip() + post_id = query.split("/")[-1].strip() + # May also be a slug string. 
+ if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + # Format https://tumblr.com/{blogname}/{post_id} + else: + blog_and_id = query.split("tumblr.com/")[-1] + blog_and_id = blog_and_id.replace("blog/view/", "") # Sometimes present in the URL + blog_name, post_id = blog_and_id.split("/") + if not post_id.isdigit(): + post_id = query.split("/")[-2].strip() + + except IndexError: + self.dataset.update_status("Invalid post URL: %s" % query) + continue - results += new_results + new_results = self.get_posts_by_blog(blog_name, post_id=post_id, max_date=max_date, min_date=min_date) + + # Get tagged post + else: + if query.startswith("#"): + query = query[1:] + + # Used for getting tagged posts, which uses requests instead. + api_key = self.parameters.get("consumer_key") + if not api_key: + api_key = SearchTumblr.get_tumblr_keys(self.owner)[0] + + new_results = self.get_posts_by_tag(query, max_date=max_date, min_date=min_date, api_key=api_key) + + results += new_results + + if self.max_posts_reached: + self.dataset.update_status("Max posts exceeded") + break + if self.api_limit_reached: + self.dataset.update_status("API limit reached") + break + + # Check for reblogged posts in the reblog trail; + # we're storing their post IDs and blog names for later, if we're adding reblogs. + if get_reblogs: + for result in results: + # The post trail is stored in the 'trail' list + for trail_post in result.get("trail", []): + # Some posts or blogs have been deleted; skip these + if "broken_blog_name" not in trail_post: + if trail_post["post"]["id"] not in self.seen_ids: + extra_posts.append({"blog": trail_post["blog"]["name"], + "id": trail_post["post"]["id"]}) + + # Get note data. + # Blog-level searches already have some note data, like reblogged text, + # but not everything (like replies), so we're going to retrieve these here as well. + # Also store IDs of reblogs/reblogged posts that we want to add. + + # Create a dictionary with the `reblog_key` as key and notes as value. + # Notes are the same for all posts in a reblog chain. + # This means that we may not have to re-query the same data. + retrieved_notes = {} + + if get_notes: + + for i, post in enumerate(results): if self.max_posts_reached: - self.dataset.update_status("Max posts exceeded") break if self.api_limit_reached: - self.dataset.update_status("API limit reached") break - # If we also want the posts that reblogged the fetched posts: - if fetch_reblogs and not self.max_posts_reached and not self.api_limit_reached: - self.dataset.update_status("Getting notes from all posts") - - # Reblog information is already returned for blog-level searches - if scope == "blog": - text_reblogs = [] - - # Loop through and add the text reblogs that came with the results. - for post_notes in all_notes: - for post_note in post_notes: - for note in post_note: - if note["type"] == "reblog": - text_reblogs.append({note["blog_name"]: note["post_id"]}) - - # Retrieving notes for tag-based posts should be done one-by-one. - # Fetching them all at once is not supported by the Tumblr API. - elif scope == "tag": - # Prepare dicts to pass to `get_post_notes` - posts_to_fetch = {result["author"]: result["id"] for result in results} - - # First extract the notes of each post, and only keep text reblogs - text_reblogs = self.get_post_notes(posts_to_fetch) - - # Get the full data for text reblogs.
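Stepping back to the query parsing earlier in this method: the accepted shapes (`#tag`, `@blog`, `@blog:post_id`, and the two post-URL formats) can be summarised in a standalone parser. This sketch mirrors the branching above but is illustrative only, and the helper name is hypothetical:

```python
def parse_tumblr_query(query: str) -> tuple:
    """Hypothetical helper: classify a query as ('tag'|'blog'|'post', blog, post_id)."""
    query = query.strip()
    if query.startswith("@"):  # @blogname or @blogname:post_id
        blog, _, post_id = query[1:].partition(":")
        return ("post", blog, post_id) if post_id else ("blog", blog, None)
    if "tumblr.com/" in query:  # post URLs in their two common shapes
        if "/post/" in query:  # https://{blog}.tumblr.com/post/{id}[/slug]
            blog = query.split(".tumblr.com")[0].split("//")[-1].replace("www.", "")
        else:  # https://tumblr.com/{blog}/{id}[/slug]
            blog = query.split("tumblr.com/")[-1].replace("blog/view/", "").split("/")[0]
        digits = [part for part in query.split("/") if part.isdigit()]
        return ("post", blog, digits[0] if digits else None)
    return ("tag", query.lstrip("#"), None)  # plain, optionally #-prefixed tag

assert parse_tumblr_query("@4catblog:123") == ("post", "4catblog", "123")
assert parse_tumblr_query("https://tumblr.com/4catblog/123456789") == ("post", "4catblog", "123456789")
```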
- if text_reblogs: - connection_retries = 0 - for i, text_reblog in enumerate(text_reblogs): - self.dataset.update_status("Got %i/%i text reblogs" % (i, len(text_reblogs))) - if connection_retries >= 5: - self.dataset.update_status("Multiple connection refused errors; unable to continue collection of reblogs.") - break - for key, value in text_reblog.items(): - if connection_retries >= 5: - break - try: - reblog_post = self.get_post_by_id(key, value) - except ConnectionRefusedError: - connection_retries += 1 - self.failed_reblogs.append(key) - self.dataset.update_status(f"ConnectionRefused: Unable to collect reblogs for post {key}") + self.dataset.update_status("Retrieving notes for post %i/%i" % (i + 1, len(results))) + + # We may have already encountered this note-chain + # with a different post. + if post["reblog_key"] in retrieved_notes: + notes = retrieved_notes[post["reblog_key"]] + + # In the case of posts with just a few notes, + # we may have all the possible notes in the retrieved JSON. + elif "notes" in post and (len(post["notes"]) == post["note_count"]): + # Add some metrics, like done in `get_notes`. + notes = { + "notes": post["notes"], + "reply_count": len([n for n in post["notes"] if n["type"] == "reply"]), + "reblog_count": len([n for n in post["notes"] if n["type"] == "reblog"]), + "like_count": len([n for n in post["notes"] if n["type"] == "like"]) + } + + else: + # Get notes via the API + # Only gets first 1,000 replies or text/tag reblogs. + + # We're using different querying modes since + # it'll speed up the process. The fastest is + # `conversation`, which prioritises text reblogs and + # replies, and also provides metrics on like and reblog counts; + # we'll use this as default. If the user + # has indicated they also want to add reblogs with tags, + # we'll also use the `reblogs_with_tags` mode. + seen_notes = set() + notes = self.get_notes(post["blog_name"], post["id"], mode="conversation", + max_reblogs=self.max_reblogs) + reblog_count = 0 + for note in notes["notes"]: + if note["type"] == "reblog": # Replies don't have IDs + reblog_count += 1 + seen_notes.add(note["post_id"]) + + # Get tag-only reblogs; these aren't returned in `conversation` mode. 
+ if reblog_type == "text_or_tag" and reblog_count <= self.max_reblogs: + tag_notes = self.get_notes(post["blog_name"], post["id"], mode="reblogs_with_tags", + max_reblogs=self.max_reblogs - reblog_count) + for tag_note in tag_notes["notes"]: + if tag_note["post_id"] not in seen_notes: + notes["notes"].append(tag_note) + + # Add to posts + results[i] = {**results[i], **notes} + retrieved_notes[post["reblog_key"]] = notes + + # Identify which notes/reblogs we can collect as new posts + if get_reblogs: + + for note in notes["notes"]: + + # Skip replies and likes + if note["type"] != "reblog": continue - if reblog_post: - reblog_post = self.parse_tumblr_posts([reblog_post], reblog=True) - results.append(reblog_post[0]) + + if note["post_id"] not in self.seen_ids: + + # Potentially skip extra posts outside of the date range + if not reblog_outside_daterange: + if note.get("timestamp"): + if not min_date <= note["timestamp"] <= max_date: + continue + + extra_posts.append({"blog": note["blog_name"], "id": note["post_id"]}) + + # Add reblogged posts and reblogs to dataset + for i, extra_post in enumerate(extra_posts): + + self.dataset.update_status("Adding %s/%s reblogs to the dataset" % (i + 1, len(extra_posts))) + + if extra_post["id"] not in self.seen_ids: + + # Potentially skip new posts outside of the date range + # not always present in the notes data. + if not reblog_outside_daterange and (max_date and min_date): + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"], max_date=max_date, + min_date=min_date) + else: + new_post = self.get_posts_by_blog(extra_post["blog"], extra_post["id"]) + + if new_post: + new_post = new_post[0] + + # Add note data; these have already been retrieved above + if get_notes: + new_post = {**new_post, **retrieved_notes[new_post["reblog_key"]]} + + results.append(new_post) + self.seen_ids.add(extra_post["id"]) self.job.finish() return results - def get_posts_by_tag(self, tag, max_date=None, min_date=None): + def get_posts_by_tag(self, tag, max_date=None, min_date=None, api_key=None): """ - Get Tumblr posts posts with a certain tag - :param tag, str: the tag you want to look for + Get Tumblr posts with a certain tag. + :param tag: the tag you want to look for :param min_date: a unix timestamp; posts should be after this date. :param max_date: a unix timestamp; posts should be before this date. + :param api_key: The API key. :returns: a list of posts created from the JSON response """ @@ -300,7 +461,7 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): retries = 0 date_retries = 0 - # We're gonna change max_date, so store a copy for reference. + # We're going to change max_date, so store a copy for reference. max_date_original = max_date # We use the average time difference between posts to spot possible gaps in the data. @@ -324,29 +485,45 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): break try: - # Use the pytumblr library to make the API call - posts = self.client.tagged(tag, before=max_date, limit=20, filter="raw") + # PyTumblr does not yet allow using the `npf` parameter + # for the `tagged` endpoint (opened a pull request), so + # we're using requests here.
+ params = { + "tag": tag, + "api_key": api_key, + "before": max_date, + "limit": 20, + "filter": "raw", + "npf": True, + "notes_info": True + } + url = "https://api.tumblr.com/v2/tagged" + response = requests.get(url, params=params) + posts = response.json()["response"] + except ConnectionError: - self.update_status("Encountered a connection error, waiting 10 seconds.") + self.dataset.update_status("Encountered a connection error, waiting 10 seconds") time.sleep(10) retries += 1 continue - # Get rid of posts that we already enountered, + # Skip posts that we already encountered, # preventing Tumblr API shenanigans or double posts because of - # time reductions. Make sure it's no odd error string, though. - unseen_posts = [] - for check_post in posts: - # Sometimes the API repsonds just with "meta", "response", or "errors". - if isinstance(check_post, str): - self.dataset.update_status("Couldn't add post:", check_post) + # time reductions. Make sure it's no error string, though. + new_posts = [] + for post in posts: + # Sometimes the API responds just with "meta", "response", or "errors". + if isinstance(post, str): + self.dataset.update_status("Couldn't add post: %s" % post) retries += 1 break else: retries = 0 + if post["id"] not in self.seen_ids: + self.seen_ids.add(post["id"]) + new_posts.append(post) - if check_post["id"] not in self.seen_ids: - unseen_posts.append(check_post) - posts = unseen_posts + + posts = new_posts # For no clear reason, the Tumblr API sometimes provides posts with a higher timestamp than requested. # So we have to prevent this manually. @@ -361,27 +538,26 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): # self.api_limit_reached = True # break - # Make sure the Tumblr API doesn't magically stop at an earlier date + # Make sure the Tumblr API doesn't magically stop even if earlier posts are available if not posts: date_retries += 1 - # We're first gonna check carefully if there's small timegaps by + # We're first going to check carefully if there are small time gaps by # decreasing by six hours. # If that didn't result in any new posts, also dedicate 12 date_retries # with reductions of six months, just to be sure there's no data from # years earlier missing. if date_retries < 96: - max_date -= 21600 # Decrease by six hours - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - decreasing time search with 6 hours to %s to make sure this is really it (retry %s/96)" % (str(len(all_posts)), tag, max_date_str, str(date_retries),)) + max_date -= 21600 # Decrease by six hours elif date_retries <= self.max_date_retries: - max_date -= 604800 # Decrease by one week - retry_str = str(date_retries - 96) - self.dataset.update_status("Collected %s posts for tag %s, but no new posts returned - no new posts found with decreasing by 6 hours, decreasing with a week to %s instead (retry %s/150)" % (str(len(all_posts)), tag, max_date_str, str(retry_str),)) + max_date -= 604800 # Decrease by one week + self.dataset.update_status("No new posts found for #%s - looking for posts before %s" % ( + tag, datetime.fromtimestamp(max_date).strftime("%Y-%m-%d %H:%M:%S"))) # We can stop when the max date drops below the min date. - if min_date: + if min_date != 0: if max_date <= min_date: break @@ -390,8 +566,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): # Append posts to main list else: - posts = self.parse_tumblr_posts(posts) - # Get all timestamps and sort them.
post_dates = sorted([post["timestamp"] for post in posts]) @@ -420,9 +594,8 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): if len(all_posts) >= 250 and time_dif > (avg_time_dif * 5): time_str = datetime.fromtimestamp(date).strftime("%Y-%m-%d %H:%M:%S") - self.dataset.update_status("Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,)) - - self.seen_ids.update([post["id"] for post in posts]) + self.dataset.update_status( + "Time difference of %s spotted, restarting query at %s" % (str(time_dif), time_str,)) posts = [post for post in posts if post["timestamp"] >= date] if posts: all_posts += posts @@ -432,21 +605,20 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): time_difs.append(time_dif) - # To start a new query + # Stop if we found nothing for this query if not posts: break # Manually check if we have a lower date than the lowest allowed date already (min date). # This functionality is not natively supported by Tumblr. - if min_date: + if min_date != 0: if max_date < min_date: # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date and post["timestamp"] <= max_date_original] + posts = [post for post in posts if min_date <= post["timestamp"] <= max_date_original] if posts: all_posts += posts - self.seen_ids.update([post["id"] for post in posts]) break # We got a new post, so we can reset the retry counts. @@ -456,9 +628,6 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): # Add retrieved posts to the main list all_posts += posts - # Add to seen ids - self.seen_ids.update([post["id"] for post in posts]) - # Add time differences and calculate new average time difference all_time_difs += time_difs @@ -475,33 +644,40 @@ def get_posts_by_tag(self, tag, max_date=None, min_date=None): self.max_posts_reached = True break - self.dataset.update_status("Collected %s posts for tag %s, now looking for posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + self.dataset.update_status( + "Collected %s posts for #%s, retrieving posts before %s" % (str(len(all_posts)), tag, max_date_str,)) + time.sleep(.2) return all_posts - def get_posts_by_blog(self, blog, max_date=None, min_date=None): + def get_posts_by_blog(self, blog, post_id=None, max_date=None, min_date=None): """ - Get Tumblr posts posts with a certain blog - :param tag, str: the name of the blog you want to look for - :param min_date: a unix timestamp, indicates posts should be min_date this date. - :param max_date: a unix timestamp, indicates posts should be max_date this date. + Get Tumblr posts from a certain blog + :param blog: the name of the blog you want to look for + :param post_id: the post ID (optional) + :param max_date: a unix timestamp; posts should be before this date. + :param min_date: a unix timestamp; posts should be after this date.
- :returns: a dict created from the JSON response + :returns: a list of posts created from the JSON response """ + blog = blog + ".tumblr.com" + if post_id: + try: + int(post_id) + except (TypeError, ValueError): + raise QueryParametersException("Post ID %s is invalid" % post_id) + if not max_date: max_date = int(time.time()) # Store all posts in here all_posts = [] - # Store notes here, if they exist and are requested - all_notes = [] # Some retries to make sure the Tumblr API actually returns everything retries = 0 - self.max_retries = 48 # 2 days + self.max_retries = 48 # 2 days # Get Tumblr posts until there's no more left. while True: @@ -515,158 +691,209 @@ def get_posts_by_blog(self, blog, max_date=None, min_date=None): try: # Use the pytumblr library to make the API call - posts = self.client.posts(blog, before=max_date, limit=20, reblog_info=True, notes_info=True, filter="raw") + posts = self.client.posts(blog, id=post_id, before=max_date, limit=20, reblog_info=True, + notes_info=True, filter="raw", npf=True) posts = posts["posts"] - #if (max_date - posts[0]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) + except ConnectionRefusedError: + retries += 1 + if post_id: + self.failed_posts.append(post_id) + self.dataset.update_status("ConnectionRefused: Unable to collect post %s/%s" % (blog, post_id)) + else: + self.dataset.update_status( + "ConnectionRefused: Unable to collect posts for blog %s before %s" % (blog, max_date)) + time.sleep(10) + continue except Exception as e: - - self.dataset.update_status("Reached the limit of the Tumblr API. Last timestamp: %s" % str(max_date)) + self.dataset.update_status("Couldn't collect posts; likely reached the limit of the Tumblr API (%s). " + "Last timestamp: %s" % (e, str(max_date))) self.api_limit_reached = True break # Make sure the Tumblr API doesn't magically stop at an earlier date if not posts or isinstance(posts, str): retries += 1 - max_date -= 3600 # Decrease by an hour + max_date -= 3600 # Decrease by an hour + self.dataset.update_status( + "No posts returned by Tumblr - checking whether this is really all (retry %s/48)" % str(retries)) continue - # Append posts to main list - else: - # Keep the notes, if so indicated - if self.parameters.get("fetch_reblogs"): - for post in posts: - if "notes" in post: - all_notes.append(post["notes"]) - - posts = self.parse_tumblr_posts(posts) - - # Get the lowest date - max_date = sorted([post["timestamp"] for post in posts])[0] - - # Manually check if we have a lower date than the min date (`min_date`) already. - # This functonality is not natively supported by Tumblr. - if min_date: - if max_date < min_date: + # Skip posts that we already encountered, + # preventing Tumblr API shenanigans or double posts because of + # time reductions. Make sure it's no error string, though. + new_posts = [] + for post in posts: + # Sometimes the API responds just with "meta", "response", or "errors".
+ if isinstance(post, str): + self.dataset.update_status("Couldn't add post: %s" % post) + retries += 1 + break + else: + retries = 0 + if post["id"] not in self.seen_ids: + self.seen_ids.add(post["id"]) + new_posts.append(post) - # Get rid of all the posts that are earlier than the max_date timestamp - posts = [post for post in posts if post["timestamp"] >= min_date] + # Possibly only keep posts within the date range. + if max_date and min_date: + new_posts = [p for p in new_posts if min_date <= p["timestamp"] <= max_date] - if posts: - all_posts += posts - break + if not new_posts: + break - retries = 0 + # Append posts to main list + all_posts += new_posts - all_posts += posts + # Get the lowest date for next loop + max_date = sorted([post["timestamp"] for post in posts])[0] - #if (max_date - posts[len(posts) - 1]["timestamp"]) > 500000: - #self.dataset.update_status("ALERT - DATES LIKELY SKIPPED") - #self.dataset.update_status([post["timestamp"] for post in posts]) + retries = 0 if len(all_posts) >= self.max_posts: self.max_posts_reached = True break + if post_id: + break - self.dataset.update_status("Collected %s posts" % str(len(all_posts))) + self.dataset.update_status("Collected %s posts for blog %s" % (str(len(all_posts)), blog)) + time.sleep(.2) - return all_posts, all_notes + return all_posts - def get_post_notes(self, di_blogs_ids, only_text_reblogs=True): + def get_notes(self, blog_id, post_id, mode="conversation", max_reblogs=1000) -> dict: """ - Gets the post notes. - :param di_blogs_ids, dict: A dictionary with blog names as keys and post IDs as values. - :param only_text_reblogs, bool: Whether to only keep notes that are text reblogs. + Gets data on the notes of a specific post. + :param blog_id: The ID of the blog. + :param post_id: The ID of the post. + :param mode: The type of notes that get priority. + `conversation` prioritises text reblogs and replies. + :param max_reblogs: The maximum number of reblogs to collect. + + :returns: a dictionary with notes and note metrics. """ - # List of dict to get reblogs.
Items are: [{"blog_name": post_id}] - text_reblogs = [] + post_notes = [] max_date = None # Do some counting - len_blogs = len(di_blogs_ids) count = 0 + # Some posts have tens of thousands of notes + # so we'll cap collection at max_reblogs + + # Stop trying to fetch the notes after this many retries - max_notes_retries = 10 + max_reblogs_retries = 10 notes_retries = 0 - for key, value in di_blogs_ids.items(): - - count += 1 + first_batch = True + note_metrics = {} - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") + stop_collecting = False - # First, get the blog names and post_ids from reblogs - # Keep digging till there's nothing left, or if we can fetch no new notes - while True: + # For status updates + note_type = "" + if mode == "conversation": + note_type = "reblogs with text" + elif mode == "reblogs_with_tags": + note_type = "reblogs with tags" - # Requests a post's notes - notes = self.client.notes(key, id=value, before_timestamp=max_date) + if self.interrupted: + raise ProcessorInterruptedException("Interrupted while fetching post notes from Tumblr") - if only_text_reblogs: + while True: - if "notes" in notes: - notes_retries = 0 + if notes_retries >= max_reblogs_retries: + self.dataset.update_status("Too many connection errors; unable to collect notes for post %s" % post_id) + self.failed_posts.append(post_id) + break - for note in notes["notes"]: - # If it's a reblog, extract the data and save the rest of the posts for later - if note["type"] == "reblog": - if note.get("added_text"): - text_reblogs.append({note["blog_name"]: note["post_id"]}) + # Request a post's notes + try: - if notes.get("_links"): - max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] + # Important: we're getting notes in 'conversation' mode to + # prioritise replies and reblogs that add text. + # We're not interested in the names of authors that liked the post + # or who reblogged without adding content. + notes = self.client.notes(blog_id, id=post_id, before_timestamp=max_date, mode=mode) + except ConnectionRefusedError: + self.dataset.update_status( + "Couldn't get notes for post %s (ConnectionRefusedError), trying again" % post_id) + notes_retries += 1 + time.sleep(10) + continue - # If there's no `_links` key, that's all. - else: - break + except Exception as e: + # Stop with unknown errors + self.dataset.update_status("Couldn't get notes for post %s. Unknown error: %s" % (post_id, e)) + notes_retries += 1 + break - # If there's no "notes" key in the returned dict, something might be up - else: - self.dataset.update_status("Couldn't get notes for Tumblr request " + str(notes)) - notes_retries += 1 - pass + if "notes" in notes: + + notes_retries = 0 + + # Add some metrics for the first response + # These metrics are only returned in conversation mode. + if first_batch and mode == "conversation": + note_metrics = { + "note_count": notes["total_notes"], + "reblog_count": notes.get("total_reblogs", 0), + "like_count": notes.get("total_likes", 0), + "reply_count": 0 + } + first_batch = False + + # Add notes + for note in notes["notes"]: + + # Only count reblogs with added content (text or hashtags) + # towards the total count; replies are never too substantial, + # so we always collect them all.
+ if mode == "conversation" and note["type"] == "reply": + note_metrics["reply_count"] += 1 + elif mode == "conversation": + count += 1 + elif mode == "reblogs_with_tags": + # Skip notes without added tags + if not note.get("tags"): + continue + count += 1 - if notes_retries > max_notes_retries: - self.failed_notes.append(key) - break + post_notes.append(note) - self.dataset.update_status("Identified %i text reblogs in %i/%i notes" % (len(text_reblogs), count, len_blogs)) + if count >= max_reblogs: + post_notes = post_notes[:count + note_metrics.get("reply_count", 0)] + stop_collecting = True - return text_reblogs + if stop_collecting: + break - def get_post_by_id(self, blog_name, post_id): - """ - Fetch individual posts - :param blog_name, str: The blog's name - :param id, int: The post ID + if notes.get("_links"): + max_date = notes["_links"]["next"]["query_params"]["before_timestamp"] - returns result list, a list with a dictionary with the post's information - """ - if self.interrupted: - raise ProcessorInterruptedException("Interrupted while fetching post from Tumblr") + self.dataset.update_status("Collected %s %s for @%s:%s" % (count, note_type, blog_id, post_id)) + time.sleep(.2) - # Request the specific post. - post = self.client.posts(blog_name, id=post_id) + # If there's no `_links` key, that's all. + else: + break - # Tumblr API can sometimes return with this kind of error: - # {'meta': {'status': 500, 'msg': 'Server Error'}, 'response': {'error': 'Malformed JSON or HTML was returned.'}} - if "posts" not in post: - return None + # If there's no "notes" key in the returned dict, something might be up + else: + notes_retries += 1 + time.sleep(1) + continue - # Get the first element of the list - it's always one post. - result = post["posts"][0] + # Merge notes and note metrics + post_notes = {"notes": post_notes, **note_metrics} - return result + return post_notes @staticmethod - def get_tumbler_keys(user): + def get_tumblr_keys(user): config_keys = [ config.get("api.tumblr.consumer_key", user=user), config.get("api.tumblr.consumer_secret", user=user), @@ -683,16 +910,19 @@ def connect_to_tumblr(self): """ # User input keys config_keys = [self.parameters.get("consumer_key"), - self.parameters.get("consumer_secret"), - self.parameters.get("key"), - self.parameters.get("secret_key")] + self.parameters.get("consumer_secret"), + self.parameters.get("key"), + self.parameters.get("secret_key")] if not all(config_keys): # No user input keys; attempt to use 4CAT config keys - config_keys = self.get_tumbler_keys(self.owner) + config_keys = self.get_tumblr_keys(self.owner) self.client = pytumblr.TumblrRestClient(*config_keys) - client_info = self.client.info() + try: + client_info = self.client.info() + except Exception as e: + raise ConnectionRefusedError("Couldn't connect to Tumblr API, (%s)" % e) # Check if there's any errors if client_info.get("meta"): @@ -713,12 +943,13 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ + # no query 4 u if not query.get("query", "").strip(): raise QueryParametersException("You must provide a search query.") # reformat queries to be a comma-separated list - items = query.get("query").replace("#","") + items = query.get("query").replace("#", "") items = items.split("\n") # Not more than 10 plox @@ -730,7 +961,7 @@ def validate_query(query, request, user): raise QueryParametersException("Search query cannot be empty.") # So it shows nicely in the frontend. 
- items = ", ".join([item.lstrip().rstrip() for item in items if item]) + items = ", ".join([item.strip() for item in items if item]) # the dates need to make sense as a range to search within query["min_date"], query["max_date"] = query.get("daterange") @@ -740,120 +971,269 @@ def validate_query(query, request, user): del query["daterange"] query["query"] = items - query["board"] = query.get("search_scope") + "s" # used in web interface # if we made it this far, the query can be executed return query - def parse_tumblr_posts(self, posts, reblog=False): + @staticmethod + def map_item(post): """ - Function to parse Tumblr posts into the same dict items. + Parse Tumblr posts. Tumblr posts can be many different types, so some data processing is necessary. - :param posts, list: List of Tumblr posts as returned form the Tumblr API. - :param reblog, bool: Whether the post concerns a reblog of posts from the original dataset. - - returns list processed_posts, a list with dictionary items of post info. + :param post: Tumblr post, as returned by the Tumblr API. + + :return dict: Mapped item """ - # Store processed posts here - processed_posts = [] - - media_tags = ["photo", "video", "audio"] - - # Loop through all the posts and write a row for each of them. - for post in posts: - post_type = post["type"] + image_urls = [] + image_urls_reblogged = [] + video_urls = [] + video_thumb_urls = [] + audio_urls = [] + audio_artists = [] + link_urls = [] + link_titles = [] + link_descriptions = [] + question = "" + answers = "" + raw_text = [] + formatted_text = [] + body_reblogged = [] + reblog_trail = [] + body_ask = [] + author_ask = "" + authors_replied = [] + replies = [] + unknown_blocks = [] + + # Sometimes the content order is reshuffled in the `layout` property, + # so we have to follow this. + content_order = [] + blocks = [] + if post.get("layout"): + if "type" in post["layout"][0]: + if post["layout"][0]["type"] == "rows": + for display in post["layout"][0].get("display", []): + content_order.append(display["blocks"][0]) + if not content_order: + content_order = range(len(post["content"])) + + # Some text blocks are 'ask' blocks + ask_blocks = [] + for layout_block in post.get("layout", []): + if layout_block["type"] == "ask": + ask_blocks += layout_block["blocks"] + author_ask = layout_block["attribution"]["blog"]["name"] + + # We're getting info as Neue Post Format types, + # so we need to loop through and join some content 'blocks'. 
+        for i in content_order:
+
+            block = post["content"][i]
+            block_type = block["type"]
+
+            # Image
+            if block_type == "image":
+                image_urls.append(block["media"][0]["url"])
+            # Audio file
+            elif block_type == "audio":
+                audio_urls.append(block["url"] if "url" in block else block["media"]["url"])
+                audio_artists.append(block.get("artist", ""))
+            # Video (embedded or hosted)
+            elif block_type == "video":
+                if "media" in block:
+                    video_urls.append(block["media"]["url"])
+                elif "url" in block:
+                    video_urls.append(block["url"])
+                if "filmstrip" in block:
+                    video_thumb_urls.append(block["filmstrip"]["url"])
+                elif "poster" in block:
+                    video_thumb_urls.append(block["poster"][0]["url"])
+                else:
+                    video_thumb_urls.append("")
+
+            # Embedded link
+            elif block_type == "link":
+                link_urls.append(block["url"])
+                if "title" in block:
+                    link_titles.append(block["title"])
+                if "description" in block:
+                    link_descriptions.append(block["description"])
+            # Poll
+            elif block_type == "poll":
+                # Only one poll can be added per post
+                question = block["question"]
+                answers = ",".join([a["answer_text"] for a in block["answers"]])
+
+            # Text; we're adding Markdown formatting.
+            elif block_type == "text":
+
+                md_text = SearchTumblr.format_tumblr_text(block)
+
+                # If it's an ask text, we're storing it in
+                # a different column
+                if i in ask_blocks:
+                    block_type = "ask"
+                    body_ask.append(block["text"])
+                else:
+                    raw_text.append(block["text"])
+                    formatted_text.append(md_text)
-			# The post's text is in different keys depending on the post type
-			if post_type in media_tags:
-				text = post["caption"]
-			elif post_type == "link":
-				text = post["description"]
-			elif post_type == "text" or post_type == "chat":
-				text = post["body"]
-			elif post_type == "answer":
-				text = post["question"] + "\n" + post["answer"]
+            # Unknown block; can be a third-party app
 			else:
-				text = ""
+                unknown_blocks.append(json.dumps(block))
 
-			# Different options for video types (YouTube- or Tumblr-hosted)
-			if post_type == "video":
+            blocks.append(block_type)
 
-				video_source = post["video_type"]
-				# Use `get` since some videos are deleted
-				video_url = post.get("permalink_url")
+        # Parse the notes; only replies are stored here
+        for note in post.get("notes", []):
+            if note["type"] == "reply":
+                authors_replied.insert(0, note["blog_name"])
+                replies.insert(0, note["reply_text"])
 
-				if video_source == "youtube":
-					# There's no URL if the YouTube video is deleted
-					if video_url:
-						video_id = post["video"]["youtube"]["video_id"]
-					else:
-						video_id = "deleted"
-				else:
-					video_id = "unknown"
+        # The API sometimes gives back a 'trail' of reblogged content.
+        # This trail is not entirely complete (e.g. it has no tags),
+        # so we'll only store the original blog name and its text + image content.
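In miniature, the trail handling in the loop that follows collects, per reblogged post, the (possibly broken) blog name plus any text and image blocks. A sketch with a made-up trail entry:

```python
reblog = {
    "blog": {"name": "original-blog"},
    "content": [
        {"type": "text", "text": "original caption"},
        {"type": "image", "media": [{"url": "https://example.com/img.jpg"}]},
    ],
}

# broken reblogs only carry "broken_blog_name" instead of a "blog" object
author = reblog.get("broken_blog_name") or reblog["blog"]["name"]
texts = [b["text"] for b in reblog.get("content", []) if b["type"] == "text"]
images = [b["media"][0]["url"] for b in reblog.get("content", []) if b["type"] == "image"]

print(author, texts, images)  # original-blog ['original caption'] ['https://example.com/img.jpg']
```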
+ for i, reblog in enumerate(post.get("trail", [])): + + reblogged_text = [] + if "broken_blog_name" in reblog: + reblog_author = reblog["broken_blog_name"] else: - video_source = None - video_id = None - video_url = None - - # All the fields to write - processed_post = { - # General columns - "type": post_type, - "timestamp": post["timestamp"], - "is_reblog": reblog, - - # Blog columns - "author": post["blog_name"], - "subject": post["blog"]["title"], - "blog_description": post["blog"]["description"], - "blog_url": post["blog"]["url"], - "blog_uuid": post["blog"]["uuid"], - "blog_last_updated": post["blog"]["updated"], - - # Post columns - "id": post["id"], - "post_url": post["post_url"], - "post_slug": post["slug"], - "thread_id": post["reblog_key"], - "body": text.replace("\x00", ""), - "tags": ", ".join(post["tags"]) if post.get("tags") else None, - "notes": post["note_count"], - "urls": post.get("link_url"), - "images": ",".join([photo["original_size"]["url"] for photo in post["photos"]]) if post.get("photos") else None, - - # Optional video columns - "video_source": video_source if post_type == "video" else None, - "video_url": video_url if post_type == "video" else None, - "video_id": video_id if post_type == "video" else None, - "video_thumb": post.get("thumbnail_url"), # Can be deleted - - # Optional audio columns - "audio_type": post.get("audio_type"), - "audio_url": post.get("audio_source_url"), - "audio_plays": post.get("plays"), - - # Optional link columns - "link_author": post.get("link_author"), - "link_publisher": post.get("publisher"), - "link_image": post.get("link_image"), - - # Optional answers columns - "asking_name": post.get("asking_name"), - "asking_url": post.get("asking_url"), - "question": post.get("question"), - "answer": post.get("answer"), - - # Optional chat columns - "chat": post.get("dialogue") - } + reblog_author = reblog["blog"]["name"] + + for reblog_block in reblog.get("content", []): + if reblog_block["type"] == "text": + reblogged_text.append(reblog_block["text"]) + if reblog_block["type"] == "image": + image_urls_reblogged.append(reblog_block["media"][0]["url"]) + + if not reblogged_text: + reblogged_text = "" + body_reblogged.append("\n".join(reblogged_text)) + + reblog_trail.append(reblog_author) + + return MappedItem({ + "type": post["original_type"] if "original_type" in post else post["type"], + "id": post["id"] if "id" in post else post["post"]["id"], + "author": post["blog_name"], + "author_avatar_url": "https://api.tumblr.com/v2/blog/" + post["blog_name"] + "/avatar", + "thread_id": post["reblog_key"], + "timestamp": datetime.fromtimestamp(post["timestamp"]).strftime("%Y-%m-%d %H:%M:%S"), + "unix_timestamp": post["timestamp"], + "author_subject": post["blog"]["title"], + "author_description": strip_tags(post["blog"]["description"]), + "author_url": post["blog"]["url"], + "author_uuid": post["blog"]["uuid"], + "author_last_updated": post["blog"]["updated"], + "post_url": post["post_url"], + "post_slug": post["slug"], + "is_reblog": True if post.get("parent_post_url") else "", + "reblog_key": post["reblog_key"], + "body": "\n".join(raw_text), + "body_markdown": "\n".join(formatted_text), + "body_reblogged": "\n\n".join(body_reblogged), + "reblog_trail": ",".join(reblog_trail), + "parent_post_author": post.get("reblogged_from_name", ""), + "parent_post_url": post.get("parent_post_url", ""), + "body_ask": "\n".join(body_ask), + "author_ask": author_ask, + "content_order": ",".join(blocks), + "tags": ",".join(post.get("tags", "")), + "note_count": 
post["note_count"], + "reblog_count": post.get("reblog_count", ""), + "like_count": post.get("like_count", ""), + "reply_count": post.get("reply_count", ""), + "authors_replied": ",".join(authors_replied), + "replies": "\n\n".join(replies), + "link_urls": ",".join(link_urls), + "link_titles": "\n".join(link_titles), + "link_descriptions": "\n".join(link_descriptions), + "image_urls": ",".join(image_urls), + "image_urls_reblogged": ",".join(image_urls_reblogged), + "video_urls": ",".join(video_urls), + "video_thumb_urls": ",".join(video_thumb_urls), + "audio_urls": ",".join(audio_urls), + "audio_artist": ",".join(audio_artists), + "poll_question": question, + "poll_answers": answers, + "unknown_blocks": "\n".join(unknown_blocks) + }) - # Store the processed post - processed_posts.append(processed_post) + @staticmethod + def format_tumblr_text(text_content): + """ + Format text content according to Tumblr's Neue Post Format definition. + Returns text as mardkown. + + :param text_content: A list of `content` as returned by the Tumblr API (can also be part of a `trail`). + :returns dict + + """ - return processed_posts + text = text_content["text"] + + if text_content.get("formatting"): + + # Dict with index numbers as keys where inserts need to be made, + # and the replacement strings as values. Done this way so we know + # when multiple formatting operations need to be made at the same + # index position. + insert_indexes = set() + inserts = {} + + for fmt in text_content["formatting"]: + fmt_type = fmt["type"] + if fmt["type"] in ("link", "bold", "italic"): + s = fmt["start"] + e = fmt["end"] + + opening = True # To know if styles need to be appended or prepended + for n in [s, e]: + insert_indexes.add(n) + n = str(n) + if n not in inserts: + inserts[n] = "" + if fmt_type == "link" and opening: + inserts[n] = inserts[n] + "[" + elif fmt_type == "link" and not opening: + inserts[n] = "](" + fmt["url"] + ")" + inserts[n] + elif fmt_type == "italic": + inserts[n] = "*" + inserts[n] if opening else inserts[n] + "*" + elif fmt_type == "bold": + inserts[n] = "**" + inserts[n] if opening else inserts[n] + "**" + opening = False + + # Change text + if inserts: + extra_chars = 0 + for n, insert in inserts.items(): + n = int(n) + extra_chars + text = text[:n] + insert + text[n:] + extra_chars += len(insert) + + # Some more 'subtype' formatting + subtype = text_content.get("subtype") + ordered_list_count = 1 + if subtype: + if subtype == "unordered-list-item": + text = "- " + text + if subtype == "ordered-list-item": + text = str(ordered_list_count) + ". 
" + text + ordered_list_count += 1 + elif subtype == "heading1": + text = "#" + text + elif subtype == "heading2": + text = "##" + text + elif subtype == "quote": + text = ">" + text + elif subtype == "indented": + text = " " + text + + return text def after_process(self): """ @@ -866,8 +1246,9 @@ def after_process(self): errors = [] if len(self.failed_notes) > 0: errors.append("API error(s) when fetching notes %s" % ", ".join(self.failed_notes)) - if len(self.failed_reblogs) > 0: - errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_reblogs)) + if len(self.failed_posts) > 0: + errors.append("API error(s) when fetching reblogs %s" % ", ".join(self.failed_posts)) if errors: self.dataset.log(";\n ".join(errors)) - self.dataset.update_status(f"Dataset completed but failed to capture some notes/reblogs; see log for details.") + self.dataset.update_status( + f"Dataset completed but failed to capture some notes/reblogs; see log for details") diff --git a/datasources/twitter-import/explorer/twitter-import-explorer.json b/datasources/twitter-import/explorer/twitter-import-explorer.json deleted file mode 100644 index 2836cd563..000000000 --- a/datasources/twitter-import/explorer/twitter-import-explorer.json +++ /dev/null @@ -1,140 +0,0 @@ -{ - "ndjson": { - "id": "{{ rest_id }}", - "author_picture": "", - "author": "{{ core.user_results.result.legacy.name }}", - "created": "{{ core.user_results.result.legacy.created_at }}", - "body": "{{ legacy.full_text }}", - "likes": "{{ legacy.favorite_count }} ", - "retweets": "{{ legacy.retweet_count }} ", - "replies": "{{ legacy.reply_count }} ", - "external_url": "https://twitter.com/{{ core.user_results.result.rest_id }}/status/{{ rest_id }}", - "image": "{{ legacy.extended_entities.media.media_url_https }}", - "sort_options": [ - { - "key": "core.user_results.result.legacy.created_at", - "label": "Old to new" - }, - { - "key": "core.user_results.result.legacy.created_at", - "label": "New to old", - "descending": true - }, - { - "key": "rest_id", - "label": "Post id", - "force_int": true - }, - { - "key": "core.legacy.favorite_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "core.legacy.retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "core.legacy.reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - }, - "csv": { - "author_picture": "", - "author": "{{ author }}", - "likes": "{{ like_count }} ", - "retweets": "{{ retweet_count }} ", - "replies": "{{ reply_count }} ", - "external_url": "https://twitter.com/{{ author }}/status/{{ id }}", - "images": "{{ images }}", - "body": "{{ body }}", - "sort_options": [ - { - "key": "unix_timestamp", - "label": "Old to new", - "force_int": true - }, - { - "key": "unix_timestamp", - "label": "New to old", - "descending": true, - "force_int": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true, - "force_int": true - }, - { - "key": "like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - }, - "ndjson_old_api": { - "author_picture": "", - "author": "{{ author_user.name }}", - "created": "{{ created_at }}", - "likes": "{{ public_metrics.like_count }} ", - "retweets": "{{ public_metrics.retweet_count }} ", 
- "replies": "{{ public_metrics.reply_count }} ", - "external_url": "https://twitter.com/{{ author_user.username }}/status/{{ id }}", - "image": "{{ attachments.media_keys.url }}", - "body": "{{ text }}", - "sort_options": [ - { - "key": "created_at", - "label": "Old to new" - }, - { - "key": "created_at", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true - }, - { - "key": "public_metrics.like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/twitter-import/search_twitter.py b/datasources/twitter-import/search_twitter.py index 9acb2b45c..8059b6b41 100644 --- a/datasources/twitter-import/search_twitter.py +++ b/datasources/twitter-import/search_twitter.py @@ -13,7 +13,7 @@ class SearchTwitterViaZeeschuimer(Search): """ - Import scraped Imgur data + Import scraped Twitter data """ type = "twitter-import" # job ID category = "Search" # category @@ -28,12 +28,12 @@ class SearchTwitterViaZeeschuimer(Search): "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" ] - + def get_items(self, query): """ Run custom search - Not available for Imgur + Not available for Twitter """ raise NotImplementedError("Twitter datasets can only be created by importing data from elsewhere") @@ -66,6 +66,7 @@ def map_item_modern(tweet): tweet["legacy"]["full_text"] = t_text quote_tweet = tweet.get("quoted_status_result") + if quote_tweet and "tweet" in quote_tweet.get("result", {}): # sometimes this is one level deeper, sometimes not... 
quote_tweet["result"] = quote_tweet["result"]["tweet"] @@ -82,6 +83,7 @@ def map_item_modern(tweet): "author_id": tweet["legacy"]["user_id_str"], "author_avatar_url": tweet["core"]["user_results"]["result"]["legacy"]["profile_image_url_https"], "author_banner_url": tweet["core"]["user_results"]["result"]["legacy"].get("profile_banner_url", ""), # key does not exist when author does not have a banner + "verified": tweet["core"]["user_results"]["result"].get("is_blue_verified", ""), "source": strip_tags(tweet["source"]), "language_guess": tweet["legacy"].get("lang"), "possibly_sensitive": "yes" if tweet.get("possibly_sensitive") else "no", @@ -93,10 +95,14 @@ def map_item_modern(tweet): "is_retweet": "yes" if retweet else "no", "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "", "is_quote_tweet": "yes" if quote_tweet else "no", - "quoted_user": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if (quote_tweet and "tombstone" not in quote_tweet["result"]) else "", + "quote_tweet_id": quote_tweet["result"].get("rest_id") if quote_tweet else "", + "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if (quote_tweet and "tombstone" not in quote_tweet["result"]) else "", + "quote_body": quote_tweet["result"]["legacy"].get("full_text","") if quote_tweet else "", + "quote_images": ",".join([media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", []) if media["type"] == "photo"]) if quote_tweet else "", + "quote_videos": ",".join([media["media_url_https"] for media in quote_tweet["result"]["legacy"].get("entities", {}).get("media", []) if media["type"] == "video"]) if quote_tweet else "", "is_quote_withheld": "yes" if (quote_tweet and "tombstone" in quote_tweet["result"]) else "no", "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != str(tweet["rest_id"]) else "no", - "replied_user": tweet["legacy"].get("in_reply_to_screen_name", ""), + "replied_author": tweet["legacy"].get("in_reply_to_screen_name", ""), "is_withheld": "yes" if withheld else "no", "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]), "urls": ",".join([url.get("expanded_url", url["display_url"]) for url in tweet["legacy"]["entities"]["urls"]]), @@ -154,9 +160,9 @@ def map_item_legacy(tweet): "is_retweet": "yes" if retweet else "no", "retweeted_user": retweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if retweet else "", "is_quote_tweet": "yes" if quote_tweet else "no", - "quoted_user": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "", + "quote_author": quote_tweet["result"]["core"]["user_results"]["result"].get("legacy", {}).get("screen_name", "") if quote_tweet else "", "is_reply": "yes" if str(tweet["legacy"]["conversation_id_str"]) != tweet_id else "no", - "replied_user": tweet["legacy"].get("in_reply_to_screen_name", "") if tweet["legacy"].get( + "replied_author": tweet["legacy"].get("in_reply_to_screen_name", "") if tweet["legacy"].get( "in_reply_to_screen_name") else "", "is_withheld": "yes" if withheld else "no", "hashtags": ",".join([hashtag["text"] for hashtag in tweet["legacy"]["entities"]["hashtags"]]), diff --git a/datasources/twitterv2/explorer/twitterv2-explorer.json b/datasources/twitterv2/explorer/twitterv2-explorer.json 
deleted file mode 100644 index c9fb03090..000000000 --- a/datasources/twitterv2/explorer/twitterv2-explorer.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "ndjson": { - "author_picture": "", - "author": "{{ author_user.name }}", - "created": "{{ created_at }}", - "likes": "{{ public_metrics.like_count }} ", - "retweets": "{{ public_metrics.retweet_count }} ", - "replies": "{{ public_metrics.reply_count }} ", - "external_url": "https://twitter.com/{{ author_user.username }}/status/{{ id }}", - "image": "{{ attachments.media_keys.url }}", - "body": "{{ text }}", - "sort_options": [ - { - "key": "created_at", - "label": "Old to new" - }, - { - "key": "created_at", - "label": "New to old", - "descending": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true - }, - { - "key": "public_metrics.like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "public_metrics.reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - }, - "csv": { - "author_picture": "", - "author": "{{ author }}", - "likes": "{{ like_count }} ", - "retweets": "{{ retweet_count }} ", - "replies": "{{ reply_count }} ", - "external_url": "https://twitter.com/{{ author }}/status/{{ id }}", - "images": "{{ images }}", - "body": "{{ body }}", - "sort_options": [ - { - "key": "unix_timestamp", - "label": "Old to new", - "force_int": true - }, - { - "key": "unix_timestamp", - "label": "New to old", - "descending": true, - "force_int": true - }, - { - "key": "id", - "label": "Post id", - "force_int": true, - "force_int": true - }, - { - "key": "like_count", - "label": "Most likes", - "descending": true, - "force_int": true - }, - { - "key": "retweet_count", - "label": "Most retweets", - "descending": true, - "force_int": true - }, - { - "key": "reply_count", - "label": "Most replies", - "descending": true, - "force_int": true - } - ] - } -} \ No newline at end of file diff --git a/datasources/vk/search_vk.py b/datasources/vk/search_vk.py index d04daba0a..22c5581a9 100644 --- a/datasources/vk/search_vk.py +++ b/datasources/vk/search_vk.py @@ -30,7 +30,7 @@ class SearchVK(Search): "[VK API documentation](https://vk.com/dev/first_guide)", "[Python API wrapper](https://github.com/python273/vk_api)" ] - + expanded_profile_fields = "id,screen_name,first_name,last_name,name,deactivated,is_closed,is_admin,sex,city,country,photo_200,photo_100,photo_50,followers_count,members_count" # https://vk.com/dev/objects/user & https://vk.com/dev/objects/group @classmethod diff --git a/helper-scripts/migrate/migrate-1.46-1.47.py b/helper-scripts/migrate/migrate-1.46-1.47.py new file mode 100644 index 000000000..2b764abd7 --- /dev/null +++ b/helper-scripts/migrate/migrate-1.46-1.47.py @@ -0,0 +1,193 @@ +# Update the 'annotations' table so every annotation has its own row. 
+# Also add extra data.
+import sys
+import os
+import json
+
+from pathlib import Path
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "../.."))
+from common.lib.database import Database
+from common.lib.logger import Logger
+
+log = Logger(output=True)
+
+import configparser
+
+ini = configparser.ConfigParser()
+ini.read(Path(__file__).parent.parent.parent.resolve().joinpath("config/config.ini"))
+db_config = ini["DATABASE"]
+
+db = Database(logger=log, dbname=db_config["db_name"], user=db_config["db_user"], password=db_config["db_password"],
+              host=db_config["db_host"], port=db_config["db_port"], appname="4cat-migrate")
+
+
+datasets = db.fetchall("SELECT * FROM datasets WHERE annotation_fields != ''")
+
+print(" Converting annotation options from lists to dicts...")
+for dataset in datasets:
+
+    annotation_fields = dataset["annotation_fields"]
+
+    # Flatten options from list of dicts to dict
+    options_converted = False
+    annotation_fields = json.loads(annotation_fields)
+    new_annotation_fields = annotation_fields
+
+    for field_id, annotation_field in annotation_fields.items():
+
+        if "options" in annotation_field:
+            flattened_options = {}
+
+            if isinstance(annotation_field["options"], list):
+                for op in annotation_field["options"]:
+                    flattened_options.update(op)
+                new_annotation_fields[field_id]["options"] = flattened_options
+                options_converted = True
+
+    if options_converted:
+        print(" Converting annotation options to dicts for dataset %s..." % dataset["key"])
+        db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_annotation_fields), dataset["key"]))
+
+print(" Expanding the 'annotations' table.")
+
+print(" Creating new annotations table...")
+db.execute("""
+CREATE TABLE IF NOT EXISTS annotations_new (
+  id                 SERIAL PRIMARY KEY,
+  dataset            TEXT,
+  field_id           TEXT,
+  item_id            TEXT,
+  timestamp          INT DEFAULT 0,
+  timestamp_created  INT DEFAULT 0,
+  label              TEXT,
+  type               TEXT,
+  options            TEXT,
+  value              TEXT,
+  author             TEXT,
+  author_original    TEXT,
+  by_processor       BOOLEAN DEFAULT FALSE,
+  metadata           TEXT
+);
+""")
+
+print(" Creating indexes for annotations table...")
+db.execute("""
+CREATE UNIQUE INDEX IF NOT EXISTS annotation_id
+  ON annotations_new (
+    id
+);
+CREATE UNIQUE INDEX IF NOT EXISTS annotation_unique
+  ON annotations_new (
+    label,
+    dataset,
+    item_id
+);
+CREATE INDEX IF NOT EXISTS annotation_value
+  ON annotations_new (
+    value
+);
+CREATE INDEX IF NOT EXISTS annotation_timestamp
+  ON annotations_new (
+    timestamp
+);
+""")
+
+print(" Transferring old annotations to new annotations table...")
+
+annotations = db.fetchall("SELECT * FROM annotations;")
+
+if not annotations:
+    print(" No annotation fields to transfer, skipping...")
+
+elif "key" not in annotations[0] and "dataset" in annotations[0]:
+    print(" Annotations table seems to have been updated already")
+
+else:
+
+    count = 0
+    skipped_count = 0
+
+    # `id` is SERIAL, so it is not part of the insert columns
+    columns = "dataset,field_id,item_id,timestamp,timestamp_created,label,type,options,value,author,author_original,by_processor,metadata"
+
+    # Each row contains **all** annotations for one dataset
+    for row in annotations:
+
+        # Parameterised to avoid quoting issues with dataset keys
+        dataset = db.fetchone("SELECT * FROM datasets WHERE key = %s;", (row["key"],))
+        # If the dataset is not present anymore,
+        # we're going to skip these annotations;
+        # likely the dataset is expired.
+        if not dataset:
+            print(" No dataset found for key %s, skipping..." % row["key"])
+            skipped_count += 1
+            continue
+
+        annotation_fields = dataset["annotation_fields"]
+        if annotation_fields:
+            annotation_fields = json.loads(dataset.get("annotation_fields"))
+        else:
+            annotation_fields = {}
+
+        author = dataset.get("creator", "")
+
+        if not row.get("annotations"):
+            print(" No annotations for dataset %s, skipping..." % row["key"])
+            skipped_count += 1
+            continue
+
+        # Loop through all annotated posts
+        for post_id, post_annotations in json.loads(row["annotations"]).items():
+
+            # Loop through individual annotations per post
+            for label, value in post_annotations.items():
+
+                # Get the ID of this particular annotation field
+                field_id = [k for k, v in annotation_fields.items() if v["label"] == label]
+
+                if field_id:
+                    field_id = field_id[0]
+
+                # Skip if this field was not saved to the datasets table
+                if not field_id or field_id not in annotation_fields:
+                    print(" Annotation field ID not saved to datasets table, skipping...")
+                    skipped_count += 1
+                    continue
+
+                ann_type = annotation_fields[field_id]["type"]
+                options = annotation_fields[field_id]["options"] if "options" in annotation_fields[field_id] else ""
+                # Options may still be stored as a list of dicts here; only then do they need flattening
+                if isinstance(options, list):
+                    options = {k: v for d in options for k, v in d.items()}
+
+                if isinstance(value, list):
+                    value = ",".join(value)
+
+                inserts = [(
+                    row["key"],              # dataset key
+                    int(field_id),           # field_id; this is an ID for the same type of input field.
+                    str(post_id),            # post_id; needs to be a string, changes per data source.
+                    dataset["timestamp"],    # timestamp
+                    dataset["timestamp"],    # timestamp_created
+                    label,                   # label
+                    ann_type,                # type
+                    json.dumps(options) if options else "",  # options; each option has a key and a value.
+                    value,                   # value
+                    author,                  # author
+                    author,                  # author_original
+                    False,                   # by_processor
+                    json.dumps({}),          # metadata
+                )]
+
+                db.execute("INSERT INTO annotations_new (" + columns + ") VALUES %s", replacements=inserts)
+
+                count += 1
+
+                if count % 10 == 0:
+                    print(" Transferred %s annotations..." % count)
+
+    print(" Done, transferred %s annotations and skipped %s annotations" % (count, skipped_count))
+
+print(" Deleting old annotations table...")
+db.execute("DROP TABLE annotations")
+
+print(" Renaming new annotations table...")
+db.execute("ALTER TABLE annotations_new RENAME TO annotations;")
+
+print(" - done!")
\ No newline at end of file
diff --git a/processors/conversion/csv_to_excel.py b/processors/conversion/csv_to_excel.py
index fe8139748..a571f287a 100644
--- a/processors/conversion/csv_to_excel.py
+++ b/processors/conversion/csv_to_excel.py
@@ -58,7 +58,7 @@ def process(self):
 		)
 
 		# recreate CSV file with the new dialect
-		with self.dataset.get_results_path().open("w") as output:
+		with self.dataset.get_results_path().open("w", encoding="utf-8") as output:
 			fieldnames = self.source_dataset.get_item_keys(self)
 
 			writer = csv.DictWriter(output, fieldnames=fieldnames, dialect="excel-mac")
diff --git a/processors/filtering/write_annotations.py b/processors/filtering/write_annotations.py
deleted file mode 100644
index 7fb8f2ee4..000000000
--- a/processors/filtering/write_annotations.py
+++ /dev/null
@@ -1,107 +0,0 @@
-"""
-Write annotations to a dataset
-"""
-from processors.filtering.base_filter import BasicProcessor
-from common.lib.helpers import UserInput
-
-__author__ = "Sal Hagen"
-__credits__ = ["Sal Hagen"]
-__maintainer__ = "Sal Hagen"
-__email__ = "4cat@oilab.eu"
-
-
-class WriteAnnotations(BasicProcessor):
-	"""
-	Write annotated data from the Explorer to a dataset.
- """ - type = "write-annotations" # job type ID - category = "Filtering" # category - title = "Write annotations" # title displayed in UI - description = "Writes annotations from the Explorer to the dataset. Each input field will get a column. This creates a new dataset." # description displayed in UI - - options = { - "to-lowercase": { - "type": UserInput.OPTION_TOGGLE, - "default": False, - "help": "Convert annotations to lowercase" - } - } - - @classmethod - def is_compatible_with(cls, module=None, user=None): - """ - Allow processor on CSV files - - :param module: Module to determine compatibility with - """ - return module.is_top_dataset() and module.get_extension() in ("csv", "ndjson") - - def process(self): - """ - Create a generator to iterate through items that can be passed to create either a csv or ndjson. Use - `for original_item, mapped_item in self.source_dataset.iterate_mapped_items(self)` to iterate through items - and yield `original_item`. - - :return generator: - """ - # Load annotation fields and annotations - annotations = self.dataset.get_annotations() - annotation_fields = self.dataset.get_annotation_fields() - - # If there are no fields or annotations saved, we're done here - if not annotation_fields: - self.dataset.update_status("This dataset has no annotation fields saved.") - self.dataset.finish(0) - return - if not annotations: - self.dataset.update_status("This dataset has no annotations saved.") - self.dataset.finish(0) - return - - annotation_labels = [v["label"] for v in annotation_fields.values()] - - to_lowercase = self.parameters.get("to-lowercase", False) - annotated_posts = set(annotations.keys()) - post_count = 0 - - # We first need to get a list of post IDs to create a list of new data. - # This is somewhat redundant since we'll have to loop through the dataset - # multiple times. - - # Create dictionary with annotation labels as keys and lists of data as values - new_data = {annotation_label: [] for annotation_label in annotation_labels} - - for item in self.source_dataset.iterate_items(self): - post_count += 1 - - # Do some loops so we have empty data for all annotation fields - if str(item["id"]) in annotations: - - for label in annotation_labels: - if label in annotations[item["id"]]: - annotation = annotations[item["id"]][label] - - # We join lists (checkboxes) - if isinstance(annotation, list): - annotation = ", ".join(annotation) - # Convert to lowercase if indicated - if to_lowercase: - annotation = annotation.lower() - - new_data[label].append(annotation) - else: - new_data[label].append("") - else: - for label in annotation_labels: - new_data[label].append("") - - if post_count % 2500 == 0: - self.dataset.update_status("Processed %i posts" % post_count) - self.dataset.update_progress(post_count / self.source_dataset.num_rows) - - # Write to top dataset - for label, values in new_data.items(): - self.add_field_to_parent("annotation_" + label, values, which_parent=self.source_dataset, update_existing=True) - - self.dataset.update_status("Annotations written to parent dataset.") - self.dataset.finish(self.source_dataset.num_rows) \ No newline at end of file diff --git a/processors/machine_learning/gpt.py b/processors/machine_learning/gpt.py new file mode 100644 index 000000000..d37be4e49 --- /dev/null +++ b/processors/machine_learning/gpt.py @@ -0,0 +1,279 @@ +""" +Prompt OpenAI GPT LLMs. 
+""" + +import json +import re +import openai + +from common.lib.helpers import UserInput +from backend.lib.processor import BasicProcessor +from common.config_manager import config + +class OpenAI(BasicProcessor): + """ + Prompt OpenAI's GPT models + """ + type = "openai-llms" # job type ID + category = "Machine learning" # category + title = "OpenAI LLMs" # title displayed in UI + description = ("Use OpenAI's LLMs (e.g. GPT-4) to generate outputs based on the parent dataset.") # description displayed in UI + extension = "csv" # extension of result file, used internally and in UI. In this case it's variable! + + references = [ + "[OpenAPI documentation](https://platform.openai.com/docs/concepts)", + "[Karjus, Andres. 2023. 'Machine-assisted mixed methods: augmenting humanities and social sciences " + "with artificial intelligence.' arXiv preprint arXiv:2309.14379.]" + "(https://arxiv.org/abs/2309.14379)", + "[Törnberg, Petter. 2023. 'How to Use LLMs for Text Analysis.' arXiv:2307.13106.](https://arxiv.org/pdf/2307.13106)"] + + config = { + "api.openai.api_key": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "OpenAI API key", + "tooltip": "Can be created on platform.openapi.com" + } + } + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + options = { + "per_item": { + "type": UserInput.OPTION_INFO, + "help": "Outputs are generated per row in the parent dataset. Use [brackets] with a column name to " + "indicate where and what dataset value you want to use, e.g.: 'Determine the language of the " + "following text: [body]').", + }, + "ethics_warning1": { + "type": UserInput.OPTION_INFO, + "help": "Before running a prompt on a large dataset, it is recommended to first create a sample and " + "test the prompt on a handful of rows. You can sample your dataset with the filter processors" + " on this page." + }, + "model": { + "type": UserInput.OPTION_CHOICE, + "help": "Model", + "options": { + "gpt-4o-mini": "GPT-4o mini", + "gpt-4o": "GPT-4o", + "gpt-4-turbo": "GPT-4 turbo", + "o1-mini": "o1-mini", + "custom": "Custom (fine-tuned) model" + }, + "default": "gpt-4o-mini" + }, + "custom_model_info": { + "type": UserInput.OPTION_INFO, + "requires": "model==custom", + "help": "[You can fine-tune a model on the OpenAI portal to improve your prompt results](" + "https://platform.openai.com/docs/guides/fine-tuning). With fine-tuned models, examples in the " + "prompt ('few-shot learning') may not be necessary anymore." + }, + "custom_model": { + "type": UserInput.OPTION_TEXT, + "help": "Model ID", + "requires": "model==custom", + "tooltip": "In the format ft:[modelname]:[org_id]:[custom_suffix]:[id]. See link above" + }, + "prompt": { + "type": UserInput.OPTION_TEXT_LARGE, + "help": "Prompt", + "tooltip": "See the academic references for this processor on best practices for LLM prompts" + }, + "temperature": { + "type": UserInput.OPTION_TEXT, + "help": "Temperature", + "default": 0.5, + "tooltip": "The temperature hyperparameter indicates how strict the model will gravitate towards the next " + "predicted word with the highest probability. A score close to 0 returns more predictable " + "outputs while a score close to 1 leads to more creative outputs." + }, + "max_tokens": { + "type": UserInput.OPTION_TEXT, + "help": "Max output tokens", + "default": 50, + "tooltip": "As a rule of thumb, one token generally corresponds to ~4 characters of " + "text for common English text." 
+ }, + "ethics_warning2": { + "type": UserInput.OPTION_INFO, + "help": "Be very sensitive with running this processor on your datasets, as data will be " + "sent to OpenAI." + }, + "ethics_warning3": { + "type": UserInput.OPTION_INFO, + "help": "Always consider anonymising your data or choosing an open-source LLM host." + }, + "consent": { + "type": UserInput.OPTION_TOGGLE, + "help": "I understand that my data is sent to OpenAI and that OpenAI may incur costs.", + "default": False, + } + } + + # Allow adding prompt answers as annotations to the top-level dataset + # if this is a direct child + if parent_dataset and parent_dataset.is_top_dataset(): + options["write_annotations"] = { + "type": UserInput.OPTION_TOGGLE, + "help": "Add output as annotations to the parent dataset.", + "default": True + } + options["annotation_label"] = { + "type": UserInput.OPTION_TEXT, + "help": "Annotation label", + "default": "", + "requires": "write_annotations==true" + } + + api_key = config.get("api.openai.api_key", user=user) + if not api_key: + options["api_key"] = { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "OpenAI API key", + "tooltip": "Can be created on platform.openapi.com" + } + + return options + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Determine if processor is compatible with a dataset or processor + + :param module: Module to determine compatibility with + """ + + return module.get_extension() in ["csv", "ndjson"] + + def process(self): + + consent = self.parameters.get("consent", False) + if not consent: + self.dataset.finish_with_error("You must consent to your data being sent to OpenAI first") + self.dataset.delete_parameter("consent") + + model = self.parameters.get("model") + if model == "custom": + if not self.parameters.get("custom_model", ""): + self.dataset.finish_with_error("You must provide a valid ID for your custom model") + else: + custom_model_id = self.parameters.get("custom_model", "") + self.parameters["model"] = custom_model_id + model = custom_model_id + + api_key = self.parameters.get("api_key") + if not api_key: + api_key = config.get("api.openai.api_key", user=self.owner) + if not api_key: + self.dataset.finish_with_error("You need to provide a valid API key") + return + + try: + temperature = float(self.parameters.get("temperature")) + except ValueError: + self.dataset.finish_with_error("Temperature must be a number") + + try: + max_tokens = int(self.parameters.get("max_tokens")) + except ValueError: + self.dataset.finish_with_error("Max tokens must be a number") + + self.dataset.delete_parameter("api_key") # sensitive, delete after use + + base_prompt = self.parameters.get("prompt", "") + self.dataset.update_status("Prompt: %s" % base_prompt) + + if not base_prompt: + self.dataset.finish_with_error("You need to insert a valid prompt") + return + + replacements = re.findall(r"\[.*?\]", base_prompt) + if not replacements: + self.dataset.finish_with_error("You need to provide the prompt with input values using [brackets] of " + "column names") + + write_annotations = self.parameters.get("write_annotations", False) + if write_annotations: + label = self.parameters.get("annotation_label", "") + if not label: + label = model + " output" + + annotations = [] + + results = [] + + # initiate + client = openai.OpenAI(api_key=api_key) + i = 1 + + for item in self.source_dataset.iterate_items(): + + # Replace with dataset values + prompt = base_prompt + for replacement in replacements: + try: + field_name = 
str(item[replacement[1:-1]]).strip() + prompt = prompt.replace(replacement, field_name) + except KeyError as e: + self.dataset.finish_with_error("Field %s could not be found in the parent dataset" % str(e)) + + try: + response = self.prompt_gpt(prompt, client, model=model, temperature=temperature, max_tokens=max_tokens) + except openai.NotFoundError as e: + self.dataset.finish_with_error(e.message) + return 0 + except openai.BadRequestError as e: + self.dataset.finish_with_error(e.message) + return 0 + + if "id" in item: + item_id = item["id"] + elif "item_id" in item: + item_id = item["item_id"] + else: + item_id = str(i) + + response = response.choices[0].message.content + results.append({ + "id": item_id, + "prompt": prompt, + model + " output": response + }) + + if write_annotations: + annotation = { + "label": label, + "item_id": item_id, + "value": response, + "type": "textarea" + } + annotations.append(annotation) + + self.dataset.update_status("Generated output for item %s/%s" % (i, self.source_dataset.num_rows)) + i += 1 + + # Write annotations + if write_annotations: + self.write_annotations(annotations, overwrite=True) + + # Write to csv file + self.write_csv_items_and_finish(results) + + @staticmethod + def prompt_gpt(prompt, client, model="gpt-4-turbo", temperature=0.2, max_tokens=50): + + # Get response + response = client.chat.completions.create( + model=model, + temperature=temperature, + max_tokens=max_tokens, + messages=[{ + "role": "user", + "content": prompt + }] + ) + + return response diff --git a/processors/machine_learning/perspective.py b/processors/machine_learning/perspective.py new file mode 100644 index 000000000..7dacdf8e2 --- /dev/null +++ b/processors/machine_learning/perspective.py @@ -0,0 +1,159 @@ +""" +Get the toxicity score for items via Perspective API. +""" +import json + +from googleapiclient.errors import HttpError + +from common.lib.helpers import UserInput +from backend.lib.processor import BasicProcessor +from googleapiclient import discovery +from common.lib.item_mapping import MappedItem +from common.config_manager import config + +class Perspective(BasicProcessor): + """ + Score items with toxicity and other scores through Google Jigsaw's Perspective API. + """ + type = "perspective" # job type ID + category = "Machine learning" # category + title = "Toxicity scores" # title displayed in UI + description = ("Use the Perspective API to score text with attributes on toxicity, " + "including 'toxicity', 'insult', and 'profanity'.") # description displayed in UI + extension = "ndjson" # extension of result file, used internally and in UI + + references = [ + "[Perspective API documentation](https://developers.perspectiveapi.com/s/about-the-api)", + "[Rieder, Bernhard, and Yarden Skop. 2021. 'The fabrics of machine moderation: Studying the technical, " + "normative, and organizational structure of Perspective API.' 
Big Data & Society, 8(2).]" + "(https://doi.org/10.1177/20539517211046181)" + ] + + config = { + "api.google.api_key": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Google API key", + "tooltip": "Can be created on console.cloud.google.com" + } + } + + @classmethod + def get_options(cls, parent_dataset=None, user=None): + options = { + "attributes": { + "type": UserInput.OPTION_MULTI, + "help": "Attributes to score", + "options": { + "TOXICITY": "Toxicity", + "SEVERE_TOXICITY": "Severe toxicity", + "IDENTITY_ATTACK": "Identity attack", + "INSULT": "Insult", + "PROFANITY": "Profanity", + "THREAT": "Threat" + }, + "default": ["TOXICITY"] + }, + "write_annotations": { + "type": UserInput.OPTION_TOGGLE, + "help": "Add attribute scores as annotations to the parent dataset.", + "default": True + } + } + + api_key = config.get("api.google.api_key", user=user) + if not api_key: + options["api_key"] = { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Google API key", + "tooltip": "Can be created on console.cloud.google.com" + } + + return options + + def process(self): + + api_key = self.parameters.get("api_key") + self.dataset.delete_parameter("api_key") # sensitive, delete after use + if not api_key: + api_key = config.get("api.google.api_key", user=self.owner) + if not api_key: + self.dataset.finish_with_error("You need to provide a valid API key") + return + + if not self.parameters.get("attributes"): + self.dataset.finish_with_error("You need to provide a at least one attribute to score") + return + + write_annotations = self.parameters.get("api_key", True) + + try: + client = discovery.build( + "commentanalyzer", + "v1alpha1", + developerKey=api_key, + discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1", + static_discovery=False, + ) + except HttpError as e: + error = json.loads(e.content)["error"]["message"] + self.dataset.finish_with_error(error) + + results = [] + annotations = [] + api_attributes = {attribute: {} for attribute in self.parameters["attributes"]} + + for item in self.source_dataset.iterate_items(self.source_file): + + if item["body"]: + + analyze_request = { + "comment": {"text": item["body"]}, + "requestedAttributes": api_attributes + } + + try: + response = client.comments().analyze(body=analyze_request).execute() + except HttpError as e: + self.dataset.update_status(str(e)) + continue + + response["item_id"] = item["id"] + response["body"] = item["body"] + results.append(response) + + if write_annotations: + for attribute in self.parameters["attributes"]: + annotation = { + "label": attribute, + "item_id": item["id"], + "value": response["attributeScores"][attribute]["summaryScore"]["value"], + } + annotations.append(annotation) + + # Write annotations + if write_annotations: + self.write_annotations(annotations, overwrite=True) + + # Write to file + with self.dataset.get_results_path().open("w", encoding="utf-8", newline="") as outfile: + for result in results: + outfile.write(json.dumps(result) + "\n") + + self.dataset.finish(len(results)) + + @staticmethod + def map_item(item): + + attribute_scores = {} + all_attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT", "PROFANITY", "THREAT"] + for att in all_attributes: + if att in item["attributeScores"]: + attribute_scores[att] = item["attributeScores"][att]["summaryScore"]["value"] + + return MappedItem({ + "item_id": item["item_id"], + "body": item.get("body"), + **attribute_scores + }) diff --git 
a/processors/machine_learning/text_from_image.py b/processors/machine_learning/text_from_image.py index f8fa4d645..def05e411 100644 --- a/processors/machine_learning/text_from_image.py +++ b/processors/machine_learning/text_from_image.py @@ -11,7 +11,7 @@ from common.config_manager import config from common.lib.dmi_service_manager import DmiServiceManager, DsmOutOfMemory, DmiServiceManagerException -from common.lib.helpers import UserInput, convert_to_int +from common.lib.helpers import UserInput from backend.lib.processor import BasicProcessor from common.lib.exceptions import ProcessorInterruptedException, ProcessorException from common.lib.item_mapping import MappedItem diff --git a/processors/metrics/annotation_metadata.py b/processors/metrics/annotation_metadata.py new file mode 100644 index 000000000..8f0b3447e --- /dev/null +++ b/processors/metrics/annotation_metadata.py @@ -0,0 +1,49 @@ +""" +Retrieves metadata on annotations for this dataset. +""" + +from backend.lib.processor import BasicProcessor + +from datetime import datetime + +class AnnotationMetadata(BasicProcessor): + """ + Download annotation metadata from this dataset + """ + type = "annotation-metadata" # job type ID + category = "Post metrics" # category + title = "Annotation metadata" # title displayed in UI + description = ("Download metadata about annotations for this dataset. " + "Includes information like annotation author, timestamp, type, etc.") # description displayed in UI + extension = "csv" # extension of result file, used internally and in UI + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Only compatible with datasets that have annotations. + + :param module: Module to determine compatibility with + """ + + return module.is_dataset() and module.has_annotations() + + def process(self): + + annotation_metadata = self.source_dataset.get_annotation_metadata() + + if not annotation_metadata: + self.dataset.finish_with_error("No annotations made for this dataset") + + for row in annotation_metadata: + timestamp = row["timestamp"] + timestamp_created = row["timestamp_created"] + row["timestamp"] = self.to_date_str(timestamp) + row["epoch_timestamp"] = timestamp + row["timestamp_created"] = self.to_date_str(timestamp_created) + row["epoch_timestamp_created"] = timestamp_created + + self.write_csv_items_and_finish(annotation_metadata) + + @staticmethod + def to_date_str(epoch_timestamp) -> str: + return datetime.strftime(datetime.utcfromtimestamp(int(epoch_timestamp)), "%Y-%m-%d %H:%M:%S") \ No newline at end of file diff --git a/processors/metrics/count_posts.py b/processors/metrics/count_posts.py index f2c70805a..239af6719 100644 --- a/processors/metrics/count_posts.py +++ b/processors/metrics/count_posts.py @@ -67,6 +67,7 @@ def process(self): counter = 0 for post in self.source_dataset.iterate_items(self): + try: date = get_interval_descriptor(post, timeframe) except ValueError as e: diff --git a/setup.py b/setup.py index 0e4e536f1..1a28cdf6b 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ "networkx~=2.8.0", "numpy>=1.19.2", "opencv-python>=4.6.0.66", + "openai", "packaging", "pandas==1.5.3", "Pillow>=10.3", diff --git a/webtool/__init__.py b/webtool/__init__.py index 4c243d7a7..8973300a4 100644 --- a/webtool/__init__.py +++ b/webtool/__init__.py @@ -110,9 +110,11 @@ import webtool.views.views_extensions import webtool.views.views_restart import webtool.views.views_user + import webtool.views.views_dataset import webtool.views.views_misc -import webtool.views.api_explorer 
+import webtool.views.views_explorer
+
 import webtool.views.api_standalone
 import webtool.views.api_tool
diff --git a/webtool/lib/template_filters.py b/webtool/lib/template_filters.py
index c1ec867a6..d3ba68314 100644
--- a/webtool/lib/template_filters.py
+++ b/webtool/lib/template_filters.py
@@ -10,14 +10,17 @@
 import os
 import re
 import requests
+import regex
 
 from urllib.parse import urlencode, urlparse
 from webtool import app, config
 from common.lib.helpers import timify_long
 from common.config_manager import ConfigWrapper
+from pathlib import Path
 
 from flask import request
 from flask_login import current_user
+from ural import urls_from_text
 
 @app.template_filter('datetime')
 def _jinja2_filter_datetime(date, fmt=None, wrap=True):
@@ -206,6 +209,44 @@ def _jinja2_filter_extension_to_noun(ext):
 	else:
 		return "item"
 
+
+@app.template_filter('social_mediafy')
+def _jinja2_filter_social_mediafy(body, datasource=""):
+    # Adds anchor links to hashtags, @-mentions, and URLs in a text body.
+    # A data source must be given to generate the correct URLs.
+
+    if not datasource:
+        return body
+
+    # Base URLs after which tags and @-mentions follow, per platform
+    base_urls = {
+        "twitter": {
+            "hashtag": "https://twitter.com/hashtag/",
+            "mention": "https://twitter.com/"
+        },
+        "tiktok": {
+            "hashtag": "https://tiktok.com/tag/",
+            "mention": "https://tiktok.com/@"
+        },
+        "instagram": {
+            "hashtag": "https://instagram.com/explore/tags/",
+            "mention": "https://instagram.com/"
+        },
+        "tumblr": {
+            "mention": "https://tumblr.com/",
+            "markdown": True
+            # Hashtags aren't linked in the post body
+        },
+        "linkedin": {
+            "hashtag": "https://linkedin.com/feed/hashtag/?keywords=",
+            "mention": "https://linkedin.com/in/"
+        },
+        "telegram": {
+            "markdown": True
+        }
+    }
+
+
 @app.template_filter("ellipsiate")
 def _jinja2_filter_ellipsiate(text, length, inside=False, ellipsis_str="…"):
 	if len(text) <= length:
@@ -375,6 +416,53 @@ def _jinja2_filter_post_field(field, post):
 
 	return formatted_field
 
+    # Supported data sources
+    known_datasources = list(base_urls.keys())
+    if datasource not in known_datasources:
+        return body
+
+    # Add URL links
+    if not base_urls[datasource].get("markdown"):
+        for url in urls_from_text(body):
+            # Escape the URL so regex metacharacters in it are matched literally
+            body = re.sub(re.escape(url), '<a href="%s">%s</a>' % (url, url), body)
+
+    # Add hashtag links
+    if "hashtag" in base_urls[datasource]:
+        tags = re.findall(r"#[\w0-9]+", body)
+        # We're sorting tags by length so we don't incorrectly
+        # replace tags that are a substring of another, longer tag.
+        tags = sorted(tags, key=lambda x: len(x), reverse=True)
+        for tag in tags:
+            # Match the string, but not if it's preceded by a >, which indicates that we've already added an anchor tag.
+            body = re.sub(r"(?<!>)(" + tag + ")", '<a href="%s">%s</a>' % (base_urls[datasource]["hashtag"] + tag[1:], tag), body)
+
+    # Add @-mention links
+    if "mention" in base_urls[datasource]:
+        mentions = re.findall(r"@[\w0-9-]+", body)
+        mentions = sorted(mentions, key=lambda x: len(x), reverse=True)
+        for mention in mentions:
+            body = re.sub(r"(?<!>)(" + mention + ")", '<a href="%s">%s</a>' % (base_urls[datasource]["mention"] + mention[1:], mention), body)
+
+    return body
+
+@app.template_filter('string_counter')
+def _jinja2_filter_string_counter(string, emoji=False):
+    # Returns a dictionary with counts of characters in a string.
+    # Also handles emojis.
+
+    # We need to convert multi-character emojis ("graphemes") to one character.
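On the grapheme comment above: the third-party `regex` module (imported earlier in this file, unlike the stdlib `re`) supports `\X`, which matches a full grapheme cluster, so multi-codepoint emoji count as a single character. The `emoji` branch that follows relies on this; in isolation:

```python
import regex  # third-party module, not the stdlib `re`

def graphemes(string):
    return [m.group(0) for m in regex.finditer(r"\X", string)]

family = "\U0001F469\u200D\U0001F469\u200D\U0001F467"  # woman + ZWJ + woman + ZWJ + girl
print(len(family))             # 5 code points
print(len(graphemes(family)))  # 1 grapheme
```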
+ if emoji == True: + string = regex.finditer(r"\X", string) # \X matches graphemes + string = [m.group(0) for m in string] + + # Count 'em + counter = {} + for s in string: + if s not in counter: + counter[s] = 0 + counter[s] += 1 + + return counter @app.template_filter('parameter_str') def _jinja2_filter_parameter_str(url): diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index 9eae3229b..21d03c206 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -198,6 +198,12 @@ article.result > section:first-child { line-height: 1.3em; } +.button-like-small.disabled { + cursor: not-allowed; + opacity: 0.5; +} + + .dataset-owner-list li { display: inline-block; } @@ -225,6 +231,7 @@ article.result > section:first-child { background: var(--gray-light); border: 1px solid var(--gray-dark); font-size: 0.8em; + cursor: pointer; } .dataset-toolbox a:hover, a.button-like-small:hover { @@ -319,6 +326,7 @@ article.result > section:first-child { } .child-wrapper .button-wrap.processor-result-indicator a { + display: inline-block; color: inherit; } @@ -333,6 +341,7 @@ article.result > section:first-child { } .child-wrapper .button-wrap .byline { + cursor: default; font-size: 0.8em; } @@ -410,7 +419,7 @@ article.result > section:first-child { display: none; } -.child-list .processor-header .property-badge { +.child-list .processor-header .property-badge, p[role=tooltip] .property-badge { border: 0; background: var(--gray); display: inline; @@ -691,4 +700,110 @@ body.image-preview { #image-zoom:checked + label img { max-height: none; cursor: zoom-out; +} + +/* EXPLORER VIEW */ +#annotation-fields-editor { + height: 0; + overflow-y: hidden; +} + +#annotation-fields-editor-controls { +} + +#annotation-fields-editor-controls li { + position: relative; + background: white; +} + +#annotation-fields-editor-controls li:not(:last-child) { + border-bottom: 1px dotted var(--contrast-dark); +} + +.annotation-field > span, .annotation-field > i, .annotation-field > select { + vertical-align: top; +} + +.annotation-field > i { + padding-top: 10px; +} + +.option-fields { + display: inline-block; + max-width: 250px; +} + +.option-field { + display: inline-block; +} + +#edit-annotation-fields #input-warning { + color: var(--accent-error); +} + +/* Remove all styles for Explorer posts */ +/* these ought to be defined specifically, */ +/* and 4CAT styles shouldn't interfere. 
*/ +#explorer-posts, #explorer-posts > ol li { + all: initial; + padding: 0; +} + +.explorer-content-container { + margin-top: 1em; +} + +#explorer-content { + all: initial; + padding: 0; +} + +.post-annotations { + height: 0; + overflow: hidden; + box-sizing: border-box; + border-width: 0; +} + +.annotation-control-button { + min-width: 140px; +} + +.annotation-field-label.invalid, .option-field > input.invalid { + border: 1px solid red; +} + +#edit-annotation-fields { + padding: 0.5em +} + +.post-annotations p[role="tooltip"] .property-badge { + display: inline-block; + margin: 2px 0px 2px 0px; + background-color: white; + border: 1px solid black; +} + +#save-annotations-notice { + position: fixed; + background-color: var(--accent-okay); + color: var(--contrast-bright); + display: none; + right: 20px; + bottom: 64px; + width: 200px; + text-align: center; + padding: 10px 5px 10px 5px; + border-radius: 10px; +} + +.delete-input { + float: right; +} + +.spinner { + animation: fa-spin; + animation-duration: 1000ms; + animation-iteration-count: infinite; + animation-timing-function: linear; } \ No newline at end of file diff --git a/webtool/static/css/explorer.css b/webtool/static/css/explorer.css deleted file mode 100644 index a3a60c0b2..000000000 --- a/webtool/static/css/explorer.css +++ /dev/null @@ -1,512 +0,0 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. - - */ - -@import url('colours.css'); - -@font-face { - font-family: 'Open Sans'; - src: url("../fonts/OpenSans-Regular.ttf") -} - -@font-face { - font-family: 'Open Sans'; - font-weight: bold; - src: url("../fonts/OpenSans-Bold.ttf") -} - -@font-face { - font-family: 'Open Sans'; - font-style: italic; - src: url("../fonts/OpenSans-Italic.ttf") -} - -@font-face { - font-family: 'Open Sans'; - font-weight: bold; - font-style: italic; - src: url("../fonts/OpenSans-BoldItalic.ttf") -} - -/** --------------------- * - Basic HTML elements - * --------------------- */ -*, body, code, select, input, textarea { - font-family: 'Open Sans', 'Trebuchet MS', sans-serif; -} - -body { - background-color: #f9fbff; - margin: 0; - padding: 0; -} - -select, input, textarea { - background: var(--gray); - border: 1px solid var(--gray); - font-size: 14px; - padding: 0.25em; -} - -label { - font-size: 14px; -} - -button { - border: 2px solid var(--contrast-dark); - background: var(--contrast-bright); - color: var(--text); - border-radius: 0.5em; - font-size: 14px; - cursor: pointer; - padding: 0.25em 1em; -} - -button:hover { - background: var(--accent); - color: var(--contrast-bright); -} - -button.invalid, button.invalid:hover { - cursor: default; - background: var(--contrast-bright); - color: var(--text); - border-color: var(--gray-dark); -} - -textarea { - width: 340px; -} - -/** --------------------- * - Header - * --------------------- */ - -body > header { - width: 100%; - margin: 0; - padding: 0; -} - -body > header h1 { - box-shadow: 0 5px 10px #888; - margin: 0; - padding: 0; - font-size: 1.5em; - background: var(--contrast-dark); - color: var(--contrast-bright); - text-align: center; - font-size: 1.5em; - line-height: 1.5em; - font-weight: bold; - padding: 0.50em 0; - cursor: default; -} - -body > header #metadata { - font-size: 16px; - min-width: 640px; - max-width: 960px; - margin: 0 auto; - margin-top: 40px; - margin-bottom: 40px; - text-align: center; -} - -body > header #metadata #parameters > span { - font-family: monospace; - font-size: 12px; - 
display: inline-block; - background: white; - margin: 2px; - padding: 4px; - border: 1px solid black; - border-radius: 5px; - cursor: default; -} - -body > header .return a { - position: absolute; - left: 0; - padding-left: 12px; - color: white; - font-size: 0.6em; - text-decoration: none; -} - -#dataset-key { - display: none; -} - -/** --------------------- * - Navigation pages - * --------------------- */ -.nav-pages { - text-align: center; -} - -span.page { - display: inline-block; - padding: 10px; - min-width: 20px; - overflow: hidden; - color: black; - background-color: white; - font-family: monospace; - border: 1px solid black; -} - -span.page.selected { - color: white; - background-color: black; -} - -/** --------------------- * - Posts - * --------------------- */ -.posts { - list-style: none; - padding: 0; -} - -.posts li.post { - position: relative; - background: #d1e2ff; - padding: 1em; - min-width: 640px; - max-width: 960px; - margin: 0 auto; - margin-bottom: 1em; - overflow-x: hidden; -} - -.posts header { - border-bottom: 1px solid #000; - display: inline; - cursor: default; -} - -.posts article { - margin-top: 0.5em; -} - -.posts article .post-content { - display: inline; - vertical-align: top; -} - -.posts header .author { - font-weight: bold; -} - -.posts header span { - display: inline-block; -} - -.posts header span:not(:last-child)::after { - content: '\2022'; - margin-left: 0.5em; -} - -.posts li.post.op { - border: 2px solid #000; - background: #485ba6; - color: #FFF; -} - -.posts li.post.op header { - border-color: #FFF; -} - -.posts li.post .post-content a { - color: #215bc6; -} - -.posts .external-url { - position: absolute; - bottom: 0; - right: 0; - padding: 10px; -} - -.posts .external-url.deactivated { - color: #cbcbcb; - cursor: not-allowed; -} - -.quote { - color: #ff5750; -} - -.highlight, :target { - outline: 2px solid #ff5750; -} -li a { - color: inherit; - text-decoration: none; -} - -.posts li .post-image img { - width: 100%; -} - -.clear { - clear: both; -} - -span.divider { - font-size: 16px; - color: var(--gray-darker); - cursor: default; -} - -/** --------------------- * - Annotations editor - * --------------------- */ -#annotations-editor-container { - background: rgba(0, 0, 0, .4); - display: none; - height: 100%; - position: fixed; - top: 0; - left: 0; - width: 100%; - z-index: 9; -} - -#annotations-editor { - width: 100px; - height: 500px; - position: absolute; - top:0; - bottom: 0; - left: 0; - right: 0; - margin: auto; - min-height: 100px; - background-color: #fff; - border-radius: 10px; - box-shadow: 10px 10px 60px #555; - width: 95%; - max-width: 620px; - padding: 15px 2%; -} - -#annotation-fields-container { - max-height: 91%; - overflow-y: scroll; -} - -#annotation-fields-container #annotation-headers { - display: flex; - align-items: flex-start; -} - -#annotation-fields-container .annotation-header { - display: inline-block; - font-size: 14px; - font-weight: bold; -} - -#annotation-fields-container .annotation-header#ah-label { - width: 235px; -} -#annotation-fields-container .annotation-header#ah-type { - width: 105px; -} - -#annotation-fields .option-fields { - display: inline-block; -} - -#annotation-fields .option-field { - display: block; -} - -.annotation-fields-row { - display: flex; - align-items: flex-start; - margin-top: 5px; - padding: 2px; -} - -.annotation-fields-row * { - margin-right: 3px; -} - -.annotation-fields-row select, .annotation-fields-row button { - height: 30px; -} - -.annotation-fields-row input { - height: 
21px; - margin-bottom:3px; -} - -.annotation-field-title { - display: inline-block; - font-size: 15px; -} - -.annotation-field-title#at-label { - width: 150px; -} - -#annotation-fields input.invalid { - border: 1px solid red; -} - -#annotation-fields i.fas.fa-trash { - margin: 0; -} - -#annotations-input-warning { - font-size: 14px; - color: red; - font-weight: bold; -} - -#annotations-input-warning.hidden { - font-size: 14px; - color: red; - font-weight: bold; -} - -#toggle-annotations { - width: 170px; -} - -#annotations-input-warning.valid{ - color: var(--accent-okay); -} - -#close-annotation-fields { - position: absolute; - right: 0; - padding: 2px; - top: 0; - margin-right: 7px; - cursor: pointer; -} - -#add-annotation-fields { - position: absolute; - bottom: 0; - right: 0; - margin: 15px; - text-align: right; - line-height: 0px; - margin-bottom: 4px; -} - -#add-annotation-fields #notice { - font-size: 12px; - text-align: right; - color: var(--gray-darker); - cursor: default; -} - -#annotation-controls { - position: fixed; - margin: 10px; - right: 0; - bottom: 0; - border-radius: 10px; - background-color: var(--gray); - z-index: 10; - box-shadow: 10px 10px 60px #555; -} - -#annotation-controls #toggle-annotation-controls { - display: inline-block; - height: 50px; - width: 50px; - line-height: 50px; - text-align: center; - vertical-align: middle; - font-size: 15px; - cursor: pointer; -} - -#toggle-annotation-fields { - margin-left: 10px; -} - -#annotation-controls #annotation-controls-buttons { - display: inline-block; -} - -#annotation-controls #annotation-controls-buttons.hidden { - display: none; - transition: display 0s, opacity 0.5s linear; -} - -.spinner { - -webkit-animation: spinner 2s linear infinite; -} - -@-webkit-keyframes spinner { - from{ - -webkit-transform: rotate(0deg); - } - to{ - -webkit-transform: rotate(360deg); - } -} - -/** --------------------- * - Annotation post elements - * --------------------- */ -.post-annotations { - background-color: #bed4f9; - margin-top: 5px; - margin-right: 20px; - padding: 5px; - display: none; -} - -li.post.op > .post-annotations { - background-color: #3b4b8c; -} - -.post-annotation { - line-height: 2.5em; -} - -.post-annotation .annotation-label { - display: inline-block; - vertical-align: middle; - font-weight: bold; - text-align: right; - width: 130px; - margin-right: 5px; - line-height: 1.6em; - overflow-x: hidden; -} - -.post-annotation.checkbox > .post-annotation-options { - display: inline-block; -} - -.post-annotation-options { - display: inline-block; - vertical-align: top; -} - -.post-annotation-options > input { - display: inline-block; -} - -.posts .external-url { -} - -/** --------------------- * - Footer - * --------------------- */ -footer { - text-align: center; - margin-top: 40px; - margin-bottom: 70px; -} \ No newline at end of file diff --git a/webtool/static/css/explorer/default.css b/webtool/static/css/explorer/default.css new file mode 100644 index 000000000..196502cbb --- /dev/null +++ b/webtool/static/css/explorer/default.css @@ -0,0 +1,147 @@ +/* + +See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
+ + */ + +@import url('colours.css'); + +@font-face { + font-family: 'Open Sans'; + src: url("../fonts/OpenSans-Regular.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + src: url("../fonts/OpenSans-Bold.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-style: italic; + src: url("../fonts/OpenSans-Italic.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + font-style: italic; + src: url("../fonts/OpenSans-BoldItalic.ttf") +} + +/** --------------------- * + Posts + * --------------------- */ + +.post { + font-family: "Open Sans"; + list-style: none; + padding: 0; +} + +.posts li.post { + position: relative; + background: #d1e2ff; + padding: 1em; + min-width: 640px; + max-width: 960px; + margin: 0 auto; + margin-bottom: 1em; + overflow-x: hidden; +} + +.posts header { + border-bottom: 1px solid #000; + display: inline; + cursor: default; +} + +.posts article { + margin-top: 0.5em; +} + +.posts article .post-content { + margin-top: 10px; + margin-bottom: 10px; +} + +.posts header .author { + font-weight: bold; +} + +.posts header span { + display: inline-block; +} + +.posts header span:not(:last-child)::after { + content: '\2022'; + margin-left: 0.5em; +} + +.tags, .metrics, .external-url { + color: #3b4f9d; +} + +.posts li.post .post-content a { + color: #215bc6; +} + + +.posts .external-url { + position: absolute; + bottom: 0; + right: 0; + padding: 10px; +} + +.posts .external-url.deactivated { + color: #cbcbcb; + cursor: not-allowed; +} + +.post-media img { + width: 100%; + max-width: 300px; +} + +span.divider { + font-size: 16px; + color: var(--gray-darker); + cursor: default; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-color: #bed4f9; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + font-weight: bold; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/datasources/dmi-tcat/explorer/dmi-tcat-explorer.css b/webtool/static/css/explorer/dmi-tcat.css similarity index 100% rename from datasources/dmi-tcat/explorer/dmi-tcat-explorer.css rename to webtool/static/css/explorer/dmi-tcat.css diff --git a/datasources/douyin/explorer/douyin-explorer.css b/webtool/static/css/explorer/douyin.css similarity index 100% rename from datasources/douyin/explorer/douyin-explorer.css rename to webtool/static/css/explorer/douyin.css diff --git a/datasources/fourchan/explorer/fourchan-explorer.css b/webtool/static/css/explorer/fourchan.css similarity index 100% rename from datasources/fourchan/explorer/fourchan-explorer.css rename to webtool/static/css/explorer/fourchan.css diff --git a/webtool/static/css/explorer/generic.css b/webtool/static/css/explorer/generic.css new file mode 100644 index 000000000..196502cbb --- /dev/null +++ b/webtool/static/css/explorer/generic.css @@ -0,0 +1,147 @@ +/* + +See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
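+ + (Note: the contents of this file appear to be byte-identical to explorer/default.css above; both new files share blob index 196502cbb. Presumably generic.css serves as the fallback stylesheet for datasources without a dedicated explorer CSS.)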
+ + */ + +@import url('colours.css'); + +@font-face { + font-family: 'Open Sans'; + src: url("../fonts/OpenSans-Regular.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + src: url("../fonts/OpenSans-Bold.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-style: italic; + src: url("../fonts/OpenSans-Italic.ttf") +} + +@font-face { + font-family: 'Open Sans'; + font-weight: bold; + font-style: italic; + src: url("../fonts/OpenSans-BoldItalic.ttf") +} + +/** --------------------- * + Posts + * --------------------- */ + +.post { + font-family: "Open Sans"; + list-style: none; + padding: 0; +} + +.posts li.post { + position: relative; + background: #d1e2ff; + padding: 1em; + min-width: 640px; + max-width: 960px; + margin: 0 auto; + margin-bottom: 1em; + overflow-x: hidden; +} + +.posts header { + border-bottom: 1px solid #000; + display: inline; + cursor: default; +} + +.posts article { + margin-top: 0.5em; +} + +.posts article .post-content { + margin-top: 10px; + margin-bottom: 10px; +} + +.posts header .author { + font-weight: bold; +} + +.posts header span { + display: inline-block; +} + +.posts header span:not(:last-child)::after { + content: '\2022'; + margin-left: 0.5em; +} + +.tags, .metrics, .external-url { + color: #3b4f9d; +} + +.posts li.post .post-content a { + color: #215bc6; +} + + +.posts .external-url { + position: absolute; + bottom: 0; + right: 0; + padding: 10px; +} + +.posts .external-url.deactivated { + color: #cbcbcb; + cursor: not-allowed; +} + +.post-media img { + width: 100%; + max-width: 300px; +} + +span.divider { + font-size: 16px; + color: var(--gray-darker); + cursor: default; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-color: #bed4f9; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + font-weight: bold; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/webtool/static/css/explorer/instagram.css b/webtool/static/css/explorer/instagram.css new file mode 100644 index 000000000..80cbcbd58 --- /dev/null +++ b/webtool/static/css/explorer/instagram.css @@ -0,0 +1,111 @@ +* { + color: black; +} + +.posts li.post { + margin: 0 auto; + max-width: 500px; + position: relative; + background-color: white; + font-family: "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + font-size: 14px; + border-bottom: 1px solid #e3e3e3; + list-style-type: none; + padding: 0; +} + +.posts header { + border: none; +} + +.posts header i.verified { + color: #0095f6; +} + +.posts header .location { + font-size: 12px; +} + +.posts article { + min-width: auto; +} + +.posts .media-container { + width: 100%; + position: relative; +} + +.posts .media-container img { + width: 100%; +} + +.posts .media-container .media-bullets { + position: absolute; + width: 100%; + bottom: 5px; + text-align: center; + font-size: 22px; + color: white; + opacity: .6; +} + +.posts header, .posts .post-content { + padding: 20px; +} + +.posts .post .body a, .posts .post a:hover { + color: #00376B; + text-decoration: none; +} + +.time, .posts .post .comments a { + color: #7b7b7b; +} + +.posts 
.external-url { + position: absolute; + bottom: 0; + right: 0; + padding: 10px; +} + +.posts .post-image { + max-width: 200px; + margin: 0 auto; + margin-top: 30px; + margin-bottom: 30px; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + border-top: 1px solid #e3e3e3; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/webtool/static/css/explorer/linkedin.css b/webtool/static/css/explorer/linkedin.css new file mode 100644 index 000000000..88ee48dfc --- /dev/null +++ b/webtool/static/css/explorer/linkedin.css @@ -0,0 +1,160 @@ +* { + color: black; +} + +body { + background-color: rgb(244, 242, 238); +} + +.posts li.post { + position: relative; + margin: 0 auto; + margin-top: 5px; + margin-bottom: 5px; + padding: 10px; + max-width: 555px; + background-color: white; + color: #2d2d2d; + line-height: 1.5; + font-family: "Segoe UI", Roboto, Helvetica, Arial, sans-serif; + font-size: 14px; + border: 1px solid rgb(227, 226, 223); + border-radius: 9px; + list-style-type: none; +} + +.posts li.post a { + color: #1069c3; + text-decoration: none; + font-weight: bold; +} + +header .inclusion-context { + font-size: 12px; + padding: 5px; + padding-bottom: 12px; + margin-bottom: 5px; + border-bottom: 1px solid #F0F0F0; + color: #707070; +} + +/* Author info */ + +header .metadata { + margin-top: 5px; + margin-bottom: 10px; +} + +header .author-avatar { + float: left; +} + +header .author-avatar img { + width: 48px; + border-radius: 100%; + padding: 5px; + display: inline-block; +} + +header .author-name a { + color: black; + font-size: 14px; +} + +header .author .author-description, header .author .time, .metrics span { + font-size: 12px; + color: #707070; +} + +.posts .external-url { + position: absolute; + top: 0; + right: 0; + padding: 10px; +} + +/* Media */ +.posts li.post .media-container { + position: relative; + width: 100%; + height: 100%; + margin-top: 10px; + margin-bottom: 5px; +} + +.posts li.post .media-container img { + width: 49.5%; + display: inline-block; +} + +.posts li.post .media-container img:first-of-type { + width: 100%; +} + +.video-thumb { + +} + +.play-button { + position: absolute; + top: 50%; + left: 45%; + font-size: 80px; +} + +.play-button i { + color: white; + opacity: .7; +} + +/* Metrics */ + +.metrics { + margin-top: 14px; +} + +.metrics img { + height: 16px; + vertical-align: middle; +} + +.metrics .shares-and-comments { + float: right; +} + + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-color: #f2f2f2; + margin-top: 10px; + border-radius: 8px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + 
+.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/datasources/reddit/explorer/reddit-explorer.css b/webtool/static/css/explorer/reddit.css similarity index 100% rename from datasources/reddit/explorer/reddit-explorer.css rename to webtool/static/css/explorer/reddit.css diff --git a/webtool/static/css/explorer/telegram.css b/webtool/static/css/explorer/telegram.css new file mode 100644 index 000000000..b4789b9e2 --- /dev/null +++ b/webtool/static/css/explorer/telegram.css @@ -0,0 +1,231 @@ +@font-face { + font-family: "Open Sans"; + src: url("../fonts/OpenSans-Regular.ttf") +} + +@font-face { + font-family: "Open Sans"; + font-weight: bold; + src: url("../fonts/OpenSans-Bold.ttf") +} + +* { + font-size: 15px; + line-height: 1.4; +} + +.explorer-content { + background-image: linear-gradient(#6ca587, #c4d18b); + padding-top: 5px; + padding-bottom: 20px; +} + +.explorer-content ol li { + padding: 1px; + background: none; +} + +.posts .post { + font-family: "Open Sans", Arial; + display: block; + position: relative; + max-width: 580px; + list-style-type: none; + margin: 0 auto; +} + +.posts .post .post-container.new-group { + margin-top: 6px; +} + +/* Profile picture */ +.posts .post .profile-picture-container { + display: inline-block; + width: 60px; + vertical-align: top; +} + +.profile-picture { + background-image: linear-gradient(#389ed5, #59c8e2); + border-radius: 100%; + width: 50px; + height: 50px; + line-height: 53px; + float: left; + text-align: center; +} + +.profile-picture .initials { + color: white; + font-size: 23px; + width: 100%; + height: 100%; +} + +/* Post content */ +.posts .post .post-content { + display: inline-block; + max-width: 80%; + list-style-type: none; + background-color: white; + border-radius: 5px 20px 20px 5px; + padding: 12px 17px; + z-index: -1; + overflow: hidden; +} + +.posts .post .post-content.new-group { + border-radius: 0px 20px 20px 5px; +} + +.bubble-left { + position: relative; + margin-right: -5px; + float: right; + z-index: 0; +} + +.author, .author a, .author a:hover { + margin-bottom: 5px; + color: #2984cd; + font-weight: bold; + text-decoration: none; +} + +.posts .post .body { + display: inline; + padding-top: 5px; + padding-bottom: 5px; +} + +.posts .post .body a { + color: #2984cd; +} + +.posts .post .reply_to { + height: 20px; + padding: 5px; + margin-bottom: 2px; + background-color: #e4f1f9; + border-left: 4px solid #2e96d2; + border-radius: 5px; +} + +.media-container { + max-height: 200px; + margin-top: -12px; + margin-left: -17px; + margin-right: -17px; + margin-bottom: 10px; + overflow: hidden; +} + +.media-container img { + margin-top: -155px; + width: 100%; +} + +.post-content.new-group .media-container { + margin-top: 10px; +} + +.post-content.new-group .media-container img { + margin-top: -155px; + border-radius: 0px; +} + +/* Emoji reaction counts */ +.reactions { + margin-top: 3px; + margin-bottom: 3px; +} + +.reaction { + display: inline-block; + color: #168acd; + background-color: #e8f5fc; + font-weight: bold; + border-radius: 15px; + margin-top: 1px; + padding: 4px; + padding-left: 8px; + padding-right: 8px; + font-size: 16px; + vertical-align: middle; +} + +.reaction .reaction-count { + padding-left: 4px; + font-size: 14px; +} + +/* Time of day on the bottom of the post */ +.metrics { + display: inline-block; + padding-left: 10px; + float: right; +} + +.metrics span { + font-size: 14px; + padding-left: 3px; + color: #a0acb6; +} + +/* External url button */ + +.external-url i { + 
color: #168acd; +} + +/* Day indicator between posts */ +.day { + margin: 15px; + text-align: center; + color: white; +} + +.day span { + padding: 5px; + padding-left: 10px; + padding-right: 10px; + background-color: rgba(0,0,0,.3); + border-radius: 20px; +} + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-image: linear-gradient(#389ed5, #59c8e2); + color: white; + border-radius: 5px 20px 20px 5px; + margin-left: 63px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/webtool/static/css/explorer/tiktok.css b/webtool/static/css/explorer/tiktok.css new file mode 100644 index 000000000..5db2a18b1 --- /dev/null +++ b/webtool/static/css/explorer/tiktok.css @@ -0,0 +1,125 @@ +.explorer-content { + background-color: white; + padding-top: 30px; +} + +.posts .post { + font-family: Arial, sans-serif; + font-size: 15px; + width: 580px; + margin: 0 auto; + background-color: white; + list-style-type: none; + border: 1px solid #efefef; + border-radius: 10px; + min-height: 50px; + margin-bottom: 20px; +} + +.posts .post .post-table { + position: relative; + display: table; + table-layout: fixed; +} + +.posts .post .post-table-row { + display: table-row; +} + +.posts .post .profile-picture { + display: table-cell; + width: 11%; + vertical-align: top; +} + +.posts .post .profile-picture img { + border-radius: 100%; +} + +.posts .post .post-content { + display: table-cell; + width: 93%; +} + +.posts .post header { + color: black; + margin-top: 5px; + margin-bottom: 5px; +} + +.posts .post header a { + color: black; +} + +.posts .post .post-content { + display: inline-block; +} + +.posts .post .post-media { + margin-top: 10px; + width: 100%; +} + +.posts .post .post-media img { + width: 100%; + border-radius: 10px; +} + +.metrics { + display: table-cell; + width: 5%; + vertical-align: top; + margin-top: 40px; +} + +.metrics span { + display: inline-block; + width: 100%; + margin-bottom: 20px; +} + +.posts .external-url { + color: rgb(104, 119, 130); +} + +span.hashtag { + color: rgb(29, 155, 240); +} + + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-color: rgb(254, 44, 85); + margin-top: 10px; + color: white; + border-radius: 10px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/datasources/tiktok_urls/explorer/tiktok_urls-explorer.css b/webtool/static/css/explorer/tiktok_urls.css similarity index 100% rename from datasources/tiktok_urls/explorer/tiktok_urls-explorer.css rename to webtool/static/css/explorer/tiktok_urls.css diff --git 
a/webtool/static/css/explorer/tumblr.css b/webtool/static/css/explorer/tumblr.css new file mode 100644 index 000000000..d792a915f --- /dev/null +++ b/webtool/static/css/explorer/tumblr.css @@ -0,0 +1,361 @@ +/* General stuff */ +.explorer-content { + background-color: #001935; + padding: 20px +} + +.posts li.post { + position: relative; + list-style-type: none; + font-family: Helvetica, sans-serif; + background-color: white; + color: black; + font-size: 16px; + margin: 0 auto; + margin-top: 20px; + padding: 0px; + border-radius: 8px; + max-width: 540px; +} + +/* Author info */ +.author-row { + display: flex; + align-items: center; + padding: 19px; + font-size: 13px; + text-decoration: none; + color: #5e5e5e; + overflow: hidden; +} + +.author { + font-size: 13px; + font-weight: bold; +} + +.author.pseudonymous { + display: inline-block; + width: 32px; + height: 32px; + border-radius: 3px; + color: white; + background-color: #2f4b66; + text-align: center; + vertical-align: middle; +} + +.author-row .author, .author-row a { + color: black; +} + +.author-row .author-avatar { + display: inline-block; +} + +.author-row .author { + display: inline-block; +} + +.author.pseudonymous i { + line-height: 32px; + color: white; +} + +.author-avatar { + width: 32px; +} + +.author-avatar:not(.reblog) { + margin-right: 10px; +} + +.author-avatar img { + border-radius: 3px; + width: 100%; +} + +.author-row.reblog { + border-bottom: 1px solid rgba(0,0,0,0.13); +} + +.reblog-notice { + padding-left: 3px; + padding-right: 3px; +} + +.reblog-icon { + height: 32px; +} + +.reblog-icon i { + background-color: #00cf35; + border-radius: 100%; + color: white; + font-size: 8px; + padding: 4px; + margin-top: 20px; + margin-left: -7px; +} + +/* Media */ +.media-container { + position: relative; + margin: 3px -19px 3px -19px; + overflow-x: hidden; +} + +.media-container img { + width: 100%; +} + +.media-container.video img { + min-height: 300px; + width: 100%; + filter: blur(1.5rem); +} + +.media-container.audio { + margin: initial; +} + +.media-container.audio audio { + width: 100%; +} + +.play-button { + position: absolute; + width: 100%; + top: 38%; + left: 45%; + font-size: 80px; +} + +.play-button i { + color: white; + opacity: .7; +} + +/* Post text content */ +.post-content { + display: block; + padding: 0px 19px 0px 19px; +} + +.post-content.reblog { + padding-bottom: 19px; + line-height: 1.5em; + border-bottom: 1px solid rgba(0,0,0,0.13); +} + +.post-content .body { + padding: 10px 0px 10px 0px; +} + +.post-content h1 { + font-size: 20px; + font-weight: bold; + background: none; + text-align: left; + color: black; +} + +.post-content h2 { + font-size: 16px; + font-weight: bold; +} + +.ask-content { + margin-bottom: 19px; + display: inline-block; + max-width: 450px; +} + +.ask-content .body-ask { + padding: 25px; + background-color: #ededed; +} + +.ask-content p { + margin: 5px 0px 5px 0px; +} + +.ask-content { +} + +.author-ask { + padding-bottom: 3px; +} + +.author-ask-avatar { + display: inline-block; + vertical-align: top; +} + +a.embedded-link:hover { + text-decoration: none; +} + +.embedded-link-box { + padding: 30px; + background-color: #001935; + color: white; + text-align: center; + font-size: 18px; + border-radius: 5px; + margin: 19px 0px 19px 0px; +} + +.embedded-link-box .link-description { + margin-top: 3px; + font-size: 14px; +} + +.poll-question { + font-size: 20px; + padding: 3px 0px 3px 0px; +} + +.poll-answer { + color: white; + background-color: #001935; + margin: 8px; + padding: 8px; + 
border-radius: 15px; + text-align: center; +} + +.tags { + display: flex; + align-items: center; + padding: 19px 0px 19px 0px; + font-size: 15px; + text-decoration: none; + list-style-type: none; + color: #5e5e5e; + word-break: break-all; +} + +.tags a { + color: #5e5e5e; +} + +.tags li { + padding: 5px 5px 5px 0px; + display: inline-block; + background-color: white; +} + +/* Post footer */ +footer { + margin: 0px 19px 19px 19px; + padding-top: 19px; + border-top: 1px solid rgba(0,0,0,0.13); +} + +.time { + color: #5e5e5e; +} + +.posts .external-url { + color: #00b4fa; + position: absolute; + top: 0; + right: 0; + padding: 15px; +} + +/* Note metrics */ +.notes { +} + +.note-counts { + padding-top: 19px; +} + +.note-count { + display: inline-block; + color: #5a5a5a; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; +} + +.note-count.total { + font-weight: bold; +} + +/* Replies */ +.replies { + margin-top: 12px; + display: table; +} + +.reply { + background-color: white; + display: table-row; +} + +.reply .author-info { + display: table-cell; +} + +.reply .author-replied-avatar { + vertical-align: middle; + display: table-cell; + padding-right: 10px; +} + +.reply-content { + vertical-align: top; + margin-top: 5px; + margin-bottom: 5px; + border-radius: 18px; + border: 1px solid #ebebeb; + padding: 9px 18px; + font-size: 14px; + color: #5e5e5e; +} + +.reply-content .author { + color: black; + margin-bottom: 5px; +} + +/* Annotation fields */ +.post-annotations { + background-color: #7c5cff; + color: white; + border-radius: 0px 0px 8px 8px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation input { + border-radius: 5px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/datasources/twitterv2/explorer/twitterv2-explorer.css b/webtool/static/css/explorer/twitter-import.css similarity index 88% rename from datasources/twitterv2/explorer/twitterv2-explorer.css rename to webtool/static/css/explorer/twitter-import.css index 2faf89fab..24199cad8 100644 --- a/datasources/twitterv2/explorer/twitterv2-explorer.css +++ b/webtool/static/css/explorer/twitter-import.css @@ -1,9 +1,3 @@ -/* - -See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for information on how to add custom CSS. 
- - */ - body { background-color: white; } diff --git a/webtool/static/css/explorer/twitter.css b/webtool/static/css/explorer/twitter.css new file mode 100644 index 000000000..1af0811e4 --- /dev/null +++ b/webtool/static/css/explorer/twitter.css @@ -0,0 +1,130 @@ +.explorer-content { + background-color: white; + padding-top: 30px; +} + +.posts .post { + font-family: Arial, sans-serif; + font-size: 15px; + width: 580px; + margin: 0 auto; + background-color: white; + list-style-type: none; + border: 1px solid #efefef; + border-radius: 10px; + min-height: 50px; + margin-bottom: 20px; +} + +.posts .post .post-table { + position: relative; + display: table; + table-layout: fixed; + width: 100%; +} + +.posts .post .post-table-row { + display: table-row; +} + +.posts .post .profile-picture { + display: table-cell; + width: 11%; + vertical-align: top; +} + +.posts .post .profile-picture img { + border-radius: 100%; +} + +.posts .post .post-content { + display: table-cell; + width: 93%; +} + +.posts .post header { + color: black; + margin-top: 5px; + margin-bottom: 5px; +} + +.posts .post header a { + color: black; +} + +.posts .post .post-content { + display: inline-block; +} + +.posts .post .media-container { + margin-top: 10px; + width: 100%; +} + +.posts .post .media-container img { + width: 100%; + border-radius: 15px; + border: 1px solid #e1e7ea; +} + +.metrics { + margin-top: 20px; +} + +.time, .metrics, .atname, .external-url a { + color: #7a8a97; +} + +.posts .post .metrics { + display: flex; + justify-content: space-between; +} + +span.hashtag { + color: rgb(29, 155, 240); +} + +.post .quote-post { + padding: 15px 20px 15px 20px; + border: 1px solid #efefef; + border-radius: 10px; + margin-top: 5px; +} + + +/** --------------------- * + Annotation post elements + * --------------------- */ +.post-annotations { + background-color: rgb(29, 155, 240); + margin-top: 10px; + color: white; + border-radius: 15px; +} + +.post-annotation { + padding: 15px; +} + +.post-annotation > .annotation-label { + display: inline-block; + vertical-align: middle; + text-align: right; + min-width: 150px; + margin-right: 5px; + line-height: 1.6em; + overflow-x: hidden; +} + +.post-annotation.checkbox > .post-annotation-options { + display: inline-block; +} + +.post-annotation-options { + display: inline-block; + vertical-align: top; +} + +.post-annotation-options > input { + display: inline-block; +} \ No newline at end of file diff --git a/datasources/twitter-import/explorer/twitter-import-explorer.css b/webtool/static/css/explorer/twitterv2.css similarity index 100% rename from datasources/twitter-import/explorer/twitter-import-explorer.css rename to webtool/static/css/explorer/twitterv2.css diff --git a/webtool/static/css/stylesheet.css b/webtool/static/css/stylesheet.css index acc4409be..70a11ba92 100644 --- a/webtool/static/css/stylesheet.css +++ b/webtool/static/css/stylesheet.css @@ -493,6 +493,10 @@ button.tooltip-trigger { border-color: var(--gray-dark); } +p[role=tooltip] > .tooltip-line { + display: inline-block; +} + /** --------------------- * Tab containers @@ -956,6 +960,11 @@ article section.data-overview .description { color: var(--contrast-bright); } +.pagination .details { + margin: 0 auto; + text-align: center; +} + .tabs { border-bottom: 1px dotted var(--contrast-dark); max-height: 5em; @@ -1205,4 +1214,4 @@ ol.result-list li.has_results .property-container.analysis a { .result-list .child-list > li { padding: 0; margin: 0.5em 0 0 0; -} \ No newline at end of file +} diff --git 
a/webtool/static/explorer-assets/linkedin_reaction_appreciation.svg b/webtool/static/explorer-assets/linkedin_reaction_appreciation.svg new file mode 100644 index 000000000..f8f1c1786 --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_appreciation.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/webtool/static/explorer-assets/linkedin_reaction_empathy.svg b/webtool/static/explorer-assets/linkedin_reaction_empathy.svg new file mode 100644 index 000000000..1dd57ca73 --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_empathy.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/webtool/static/explorer-assets/linkedin_reaction_entertainment.svg b/webtool/static/explorer-assets/linkedin_reaction_entertainment.svg new file mode 100644 index 000000000..ebaa24308 --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_entertainment.svg @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/webtool/static/explorer-assets/linkedin_reaction_interest.svg b/webtool/static/explorer-assets/linkedin_reaction_interest.svg new file mode 100644 index 000000000..a18c215c0 --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_interest.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/webtool/static/explorer-assets/linkedin_reaction_like.svg b/webtool/static/explorer-assets/linkedin_reaction_like.svg new file mode 100644 index 000000000..37fd40d4a --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_like.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/webtool/static/explorer-assets/linkedin_reaction_praise.svg b/webtool/static/explorer-assets/linkedin_reaction_praise.svg new file mode 100644 index 000000000..e4a45cf7a --- /dev/null +++ b/webtool/static/explorer-assets/linkedin_reaction_praise.svg @@ -0,0 +1,27 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/webtool/static/img/go-to-media.jpg b/webtool/static/img/go-to-media.jpg new file mode 100644 index 000000000..bf8be3e3d Binary files /dev/null and b/webtool/static/img/go-to-media.jpg differ diff --git a/webtool/static/js/explorer.js b/webtool/static/js/explorer.js index c421cc001..a7889413b 100644 --- a/webtool/static/js/explorer.js +++ b/webtool/static/js/explorer.js @@ -2,9 +2,13 @@ $(document).ready(function(){ $(init); -/** +/* * Page init */ + +// Timer variable to start/reset saving annotations. +var save_timer = null; + function init() { // Functional stuff @@ -15,150 +19,127 @@ function init() { } -/** +/* * Handle annotations */ const annotations = { init: function() { - // Show and hide the annotation controls - $("#toggle-annotation-controls").on("click", function() { - $("#annotation-controls-buttons").toggleClass("hidden"); - if ($("#annotation-controls-buttons").hasClass("hidden")) { - $(this).html(""); - } - else { - $(this).html(""); - } - }); + let editor = $("#annotation-fields-editor"); + let editor_controls = $("#annotation-fields-editor-controls"); // Add a new annotation field when clicking the plus icon - $("#add-annotation-field").on("click", function(){$("#annotation-fields").append(annotations.getAnnotationsDiv);}); - - // Show and hide the annotations editor - $("#toggle-annotation-fields").on("click", function(){$("#annotations-editor-container").toggle();}); - $("#annotations-editor").click(function(e) { - e.stopPropagation(); - }); - - // Keep track of when the annotation fields were edited. 
- $("#annotation-fields").on("click", "#add-annotation-field, .delete-input, .delete-input i, .delete-option-field, .delete-option-field i", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); - }); - $("#annotation-fields").on("change keydown", "input, select", function() { - $("#save-annotation-fields").removeClass("invalid").removeAttr("disabled"); + $("#new-annotation-field").on("click", function(){ + annotations.addAnnotationField(); }); - // Close the annotation field editor (ask whether unsaved changes can be discarded) - $("#annotations-editor-container, #close-annotation-fields").click(function(e){ - e.preventDefault(); - if (!$("#save-annotation-fields").prop("disabled")) { - let conf = confirm("Close without applying input fields?"); - if (conf) { - $("#annotations-editor-container").hide(); - $("#annotation-fields").html(old_annotation_fields); - $("#save-annotation-fields").addClass("invalid").prop("disabled", true); - } + // Show and hide the annotations editor + let toggle_fields = $("#toggle-annotation-fields") + toggle_fields.on("click", function(){ + if (toggle_fields.hasClass("shown")) { + $("#toggle-annotation-fields").html(" Edit fields"); + toggle_fields.removeClass("shown"); + editor.animate({"height": 0}, 250); } else { - $("#annotations-editor-container").hide(); + $("#toggle-annotation-fields").html(" Hide editor"); + toggle_fields.addClass("shown"); + // Bit convoluted, but necessary to restore auto height + let current_height = editor.height(); + let auto_height = editor.css("height", "auto").height(); + editor.height(current_height).animate({"height": auto_height}, 250, function(){ + editor.height("auto"); + }); } }); // Show and hide annotations $("#toggle-annotations").on("click", function(){ - if (!$(this).hasClass("invalid")) { - annotations.toggleAnnotations(); + if (!$(this).hasClass("disabled")) { + if ($(this).hasClass("shown")) { + annotations.hideAnnotations(); + } + else { + annotations.showAnnotations(); + } } }); // Delete an entire annotation input - $("#annotation-fields").on("click", ".annotation-field > .delete-input", function(e){$(this).parent().remove();}); + // We're in a grid of threes, so this involves three divs + editor_controls.on("click", ".annotation-field > .delete-input", function(){ + let parent_div = $(this).parent(); + parent_div.remove(); + }); // Make saving available when annotation fields are changed - $("#annotation-fields").on("click", ".annotation-field > .option-fields > .option-field > .delete-option-field", function() { + editor_controls.on("click", ".delete-option-field", function() { annotations.deleteOption(this); }); - $("#annotation-fields").on("change", ".annotation-field > .annotation-field-type", function(e) {annotations.toggleField(e.target);}); + editor_controls.on("change", ".annotation-field-type", function(e) {annotations.toggleField(e.target);}); - // Make enter add a new option field - $("#annotation-fields").on("keypress", "input", function(e){ - if (e.which == 13) { + // Make enter apply the option fields + editor_controls.on("keypress", "input", function(e){ + if (e.which === 13) { annotations.applyAnnotationFields(); } }); - // Save the annotations fields to the database - $("#save-annotation-fields").on("click", annotations.applyAnnotationFields); + // Save the annotation fields to the database + $("#apply-annotation-fields").on("click", annotations.applyAnnotationFields); // Dynamically add a new option field when another is edited - 
$("#annotation-fields").on("keyup", ".annotation-field > .option-fields > .option-field > input", function(e) { + editor_controls.on("keyup", ".option-field > input", function(e) { if ($(this).val().length > 0) { annotations.addOptions(e.target); } }); - - // Make saving available when annotations are changed - $(".post-annotations").on("keydown", "input, textarea", function() { annotations.enableSaving(); }); - $(".post-annotations").on("click", "option, input[type=checkbox], label", function() { annotations.enableSaving(); }); // Keep track of whether the annotations are edited or not. - $(".post-annotations").on("change", ".post-annotation-input, .post-annotation input, .post-annotation textarea", function(){$(this).addClass("edited")}); - - // Save the annotations to the database - $("#save-annotations").on("click", function(){ - if (!$(this).hasClass("invalid")) { - annotations.saveAnnotations(); + let post_annotations = $(".post-annotations"); + post_annotations.on("keydown keyup change", + ".post-annotation-input, input[type=checkbox], label, option", + function(){ + + let parent = $(this).parent(); + // Navigate one level up if it's a checkbox or dropdown input + if (parent.hasClass("post-annotation-options")) { + parent = parent.parent(); } + annotations.markChanges(parent); }); - $("#save-to-dataset").on("click", function(){ - if (!$(this).hasClass("invalid")) { + // Save the annotations to the database + $("#save-annotations").on("click", function(){ + if (!$(this).hasClass("disabled")) { + clearTimeout(save_timer); + save_timer = null; annotations.saveAnnotations(); - annotations.writeAnnotations(); } }); - // Ask whether the next page should be opened without saving annotations - $('a > .page').click(function(){ - if (!$("#save-annotations").prop('disabled')) { - return confirm("You'll lose unsaved annotations for this page if you don't save first.\nDo you still want to continue?"); - } - }) - - var old_annotation_fields = $("#annotation-fields").html(); - // Check whether there's already fields saved for this dataset annotations.fieldsExist(); - // Save annotations every 10 seconds - setInterval(function() { - annotations.saveAnnotations(); - }, 10000); - }, toggleField: function (el) { - // Change the type of input fields when switching in the dropdown let type = $(el).val(); - let old_type = $(el).attr("data-val"); - - if (type == "text" || type == "textarea") { - $(el).parent().find(".option-fields").remove(); + let options = $(el).parent().parent().find(".option-fields"); + if (type === "text" || type === "textarea") { + options.remove(); } - else if (type == "dropdown" || type == "checkbox") { - if (!($(el).siblings(".option-fields").length) > 0) { - $(el).after("
"); - $(el).next().append(annotations.getInputField); + else if (type === "dropdown" || type === "checkbox") { + if (options.children().length === 0) { + options.append(annotations.getInputField); } } }, addOptions: function (el){ - // Dynamically a new options for dropdowns and checkboxes - + // Dynamically a new options for dropdowns and checkboxes in the fields editor. // If text is added to a field, and there are // no empty fields available, add a new one. let no_empty_fields = true; @@ -168,13 +149,14 @@ const annotations = { no_empty_fields = false; } input_fields.each(function(){ - var input_field = $(this).find("input"); + let input_field = $(this).find("input"); let val = input_field.val(); if (!val.length > 0) { no_empty_fields = false; } }); + // Add a new field if there's no empty ones if (no_empty_fields) { $(el).parent().after(annotations.getInputField); } @@ -192,11 +174,11 @@ const annotations = { count++; // Don't add a delete option for the last (empty) input. - if (count == amount) { + if (count === amount) { return false; } $(this).append(` - `); + `); }); } }, @@ -206,18 +188,20 @@ const annotations = { $(el).parent().remove(); // Make sure you can't delete the last element - if (input_fields.find(".option-field").length == 1) { + if (input_fields.find(".option-field").length === 1) { input_fields.find(".delete-option-field").remove(); } }, - parseAnnotationFields: function (e) { - // Validates and converts the fields in the annotations editor. - // Returns an object with the set annotation fields. + parseAnnotationFields: function () { + /* + Validates and converts the fields in the annotations editor. + Returns an object with the set annotation fields. + */ - annotation_fields = {}; - var warning = ""; - var labels_added = [] + let annotation_fields = {}; + let warning = ""; + let labels_added = [] annotations.warnEditor(""); @@ -226,51 +210,56 @@ const annotations = { // Parse information from the annotations editor. $(".annotation-field").each(function(){ - let label_field = $(this).children(".annotation-field-label"); - let label = label_field.val().replace(/\s+/g, ' ');; + let ann_field = $(this); + + let label_field = ann_field.find(".annotation-field-label"); + let type = ann_field.find(".annotation-field-type").val(); + let option_fields = ann_field.find(".option-fields"); + let label = label_field.val().replace(/\s+/g, ' '); + let no_options_added = false - // Get the random identifier of the field, so we + // Get the ID of the field, so we // can later check if it already exists. - let field_id = parseInt(this.id.split("-")[1]); + let field_id = ann_field.attr("id").split("-")[1]; // Make sure the inputs have a label if (!label.length > 0) { label_field.addClass("invalid"); - warning = "Input names can't be empty"; + warning = "Field labels can't be empty"; } - // Make sure the names can't be duplicates. 
- if (labels_added.includes(label)) { - warning = "Fields must be unique"; + // Make sure the labels can't be duplicates + else if (labels_added.includes(label)) { + warning = "Field labels must be unique"; label_field.addClass("invalid"); } - // Set the types and values of the annotation - type = $(this).children(".annotation-field-type").val(); + // We can't add field labels that are also existing column names + else if (original_columns.includes(label)) { + warning = "Field label " + label + " is already present as a dataset item, please rename."; + label_field.addClass("invalid"); + } // Keep track of the labels we've added - labels_added.push(label) - - if (type == "text" || type == "textarea") { + labels_added.push(label); + if (type === "text" || type === "textarea") { annotation_fields[field_id] = {"type": type, "label": label}; } // Add options for dropdowns and checkboxes - else { - let options = []; // List of dicts, because it needs to be ordered + else if (option_fields.length > 0) { + let options = new Map(); // Map, because it needs to be ordered let option_labels = []; - let no_options_added = true; - let option_id = "" - $(this).find(".option-field > input").each(function(){ - - let option_label = $(this).val(); - let option_id = this.id.replace("input-", ""); + no_options_added = true; - if (!option_labels.includes(option_label) && option_label.length > 0) { + option_fields.find(".option-field").each(function(){ + let option_input = $(this).find("input"); + let option_label = option_input.val().replaceAll(",", ""); // No commas allowed + let option_id = option_input.attr("id").replace("option-", ""); - // We're using a unique key for these to match input fields. - option = {} - option[option_id] = option_label - options.push(option); + // New option label + if (!option_labels.includes(option_label) && option_label.length > 0) { + // We're using a unique key for options as well. + options.set(option_id, option_label); option_labels.push(option_label); no_options_added = false; } @@ -284,16 +273,15 @@ const annotations = { // But there must be at least one field in there. }); - if (no_options_added) { warning = "At least one field must be added"; - $(this).find(".option-field > input").first().addClass("invalid"); + ann_field.find(".option-fields .option-field input").first().addClass("invalid"); } - if (Object.keys(options).length > 0) { + if (options.size > 0) { // Strip whitespace from the input field key label = label.replace(/\s+/g, ' '); - annotation_fields[field_id] = {"type": type, "label": label, "options": options}; + annotation_fields[field_id] = {"type": type, "label": label, "options": Object.fromEntries(options)}; } } }); @@ -301,33 +289,81 @@ const annotations = { if (warning.length > 0) { return warning; } - console.log(annotation_fields) return annotation_fields; }, + parseAnnotation: function(el) { + /* + Converts the DOM objects of an annotation + to an annotation object. + + Must be given a .post-annotation div element. 
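+ + The div's class list is expected to encode the annotation's metadata in a fixed order, + i.e. "post-annotation field-<field_id> type-<type> item-id-<item_id>"; + the index-based class parsing below relies on this order.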
+ */ + + let ann_input = el.find(".post-annotation-input"); + let ann_classes = el.attr("class").split(" "); + let ann_type = ann_classes[2].replace("type-", ""); + let field_id = ann_classes[1].replace("field-", ""); + let item_id = ann_classes[3].replace("item-id-", ""); + let label = el.find(".annotation-label").text(); + let author = el.find(".annotation-author").html(); + let options = el.find(".annotation-options").html(); + let timestamp = parseInt(el.find(".epoch-timestamp-edited").html()); + + let val = undefined; + + // Extract the value depending on the input type; an annotation object is + // returned even if the value is an empty string. + + if (ann_type === "text" || ann_type === "textarea") { + val = ann_input.val(); + } else if (ann_type === "dropdown") { + val = $(ann_input).find(":selected").val(); + } else if (ann_type === "checkbox") { + val = []; + el.find(".post-annotation-input").each(function () { + let checkbox = $(this); + if (checkbox.prop("checked") === true) { + val.push(checkbox.val()); + } + }); + } + + // Create and return the annotation object. + let annotation = { + "field_id": field_id, + "item_id": item_id, + "label": label, + "type": ann_type, + "value": val, + "author": author, + "by_processor": false, // Explorer annotations are human-made! + "timestamp": timestamp, + "options": options, + } + return annotation; + }, + applyAnnotationFields: function (e){ // Applies the annotation fields to each post on this page. // First we collect the annotation information from the editor - var annotation_fields = annotations.parseAnnotationFields(e); - var fields_to_add = {}; - + let new_annotation_fields = annotations.parseAnnotationFields(e); + // Show an error message if the annotation fields were not valid. - if (typeof annotation_fields == 'string') { - annotations.warnEditor(annotation_fields); - return + if (typeof new_annotation_fields == "string") { + annotations.warnEditor(new_annotation_fields); } // If everything is ok, we're going to add // the annotation fields to each post on the page. else { - $("#save-annotation-fields").html(" Applying") - + // Remove warnings annotations.warnEditor("") - $("#annotation-fields").find("input").each(function(){ + $("#annotation-field").find("input").each(function(){ $(this).removeClass("invalid"); }); $(".option-fields").find("input").each(function(){ @@ -335,353 +371,140 @@ $(this).removeClass("invalid"); }); // We store the annotation fields in the dataset table. - annotations.saveAnnotationFields(annotation_fields) - - // Get the ID (stored as class) of fields we've already added (could be none) - var added_fields = []; - $(".posts li").first().find(".post-annotation").each(function(){ - cls = this.className.split(" ")[1]; - if (!added_fields.includes(cls)){ - added_fields.push(cls); - } - }); - - // Add input fields to every posts in the explorer. - // We take the annotations of the first post to check - // what's the current state and add them to every post after. - let text_fields = ["textarea", "text"]; - - // Loop through all the annotation fields - for (var field in annotation_fields) { - - // Get some variables - let input_type = annotation_fields[field].type; - let input_label = annotation_fields[field].label; - let input_id = "field-" + field; - let class_id = "." + input_id; - - // We first have to check whether this annotation field was already added. - // If this is the case, we're either going to add or convert the fields. 
- if (added_fields.includes(input_id)) { - - // Edit the labels if they have changed. - label_span = $(class_id + " > .annotation-label"); - label = label_span.first().text(); - if (label != input_label) { - label_span.each(function(){ - $(this).text(input_label); - }); - } - - // If the type of input field has changed, - // we'll convert the data where possible. - // Last class is the input type - let old_input_type = $(class_id).first().attr('class').split(' ').at(-1); - - // If the change is between a textbox and textarea, - // change the input type and carry over the text. - if (input_type != old_input_type) { - - if (text_fields.includes(input_type) && text_fields.includes(old_input_type)) { - - $(class_id + " > .post-annotation-input").each(function(){ - - // Get the old inserted text, if it's present - let add_val = ""; - if ($(this).val().length > 0 && $(this).val() != undefined ){ - add_val = $(this).val(); - } - - // Replace the HTML element, insert old values, and change the type class - if (input_type == "text" && old_input_type == "textarea") { - $(this).parent().removeClass("textarea").addClass("text"); - $(this).replaceWith($("").val(add_val)); - } - else if (input_type == "textarea" && old_input_type == "text") { - $(this).parent().removeClass("text").addClass("textarea"); - $(this).replaceWith($("")); - } - }); - } - - // We don't don't convert for changes between checkboxes and dropdowns - // or between a text input and dropdowns or checkboxes. - // Simply replace the elements. Old data will be lost. - else { - $(class_id).remove(); - fields_to_add[field] = annotation_fields[field]; - } - } - - // For dropdowns and checkboxes, we're checking whether we - // have to add or change any of their options. - else if (input_type == "checkbox" || input_type == "dropdown"){ - - let options = annotation_fields[field].options; - let valid_options = []; - let option_list = $(class_id).find(".post-annotation-options"); - - // Let's take the first post's options as a check. - let existing_options = $(class_id).first(); - - for (let i in options) { - - for (let option_id in annotation_fields[field]["options"][i]) { - - //let option_id = annotation_fields[field]["options"][i][option_id]; - let existing_option = existing_options.find(".option-" + option_id); - let old_label = existing_option.first().val(); - let new_label = annotation_fields[field]["options"][i][option_id]; - - // If this field does not exist yet, add it - if (!existing_option.length) { - - option_list.each(function(){ - // We need a unique ID for the posts's specific option element. - // Else it gets messy with elements across posts. - let post_id = $(this).parents("li").attr("id").split("post-")[1]; - post_option_id = post_id + "-" + option_id; - - if (input_type == "dropdown") { - $(this).append(""); - } - else if (input_type == "checkbox") { - $(this).append(""); - } - }); - } - - // Change the option labels if they have been edited - else if (old_label != new_label) { - $(class_id).find(".option-" + option_id).each(function(){ - $(this).val(new_label).text(new_label); - $(this).next("label").text(new_label); - }); - } - valid_options.push("option-" + option_id); - } - } - - // Delete any fields that were removed from the checkbox/dropdown. 
- let present_options = []; - option_list.first().find(".post-annotation-input").each(function(){ - if ((this.id).length > 0) { - present_options.push(this.className.replace(" edited", "").split(" ").at(-1)); - } - }); - - for (let z in present_options) { - if (!valid_options.includes(present_options[z])){ - let remove_input = $(class_id).find("." + present_options[z]); - remove_input.next("label").remove(); - remove_input.remove(); - } - } - } - } - - // If this annotation has not been added yet, do so now. - else { - fields_to_add[field] = annotation_fields[field]; - } - } - - // Else we're adding them - for (var add_field in fields_to_add) { - - // Get some variables - let input_type = fields_to_add[add_field].type; - let input_id = "field-" + add_field; - let input_label = fields_to_add[add_field].label - - // Add a label for the field - el = "
"; - - // Add a text input for text fields - if (input_type == "text") { - el += ""; - } - else if (input_type == "textarea") { - el += ""; - } - - // Add a dropdown for dropdown fields - else if (input_type == "dropdown") { - - el += ""; - } - - // Add checkboxes for checkbox fields - else if (input_type == "checkbox") { - - el += "
"; - let options = fields_to_add[add_field].options; - - for (let i in options) { - - for (let option_id in options[i]) { - - option_label = options[i][option_id]; - - el += ""; - } - } - el += "
"; - } - el += "
"; - $(".posts li").each(function(){ - let post_id = this.id.split("post-")[1]; - $(this).find(".post-annotations").append(el.replaceAll("{POST_ID}", post_id)); - }); + // First check if existing annotations are affected. + if (annotation_fields) { + annotations.checkFieldChanges(new_annotation_fields, annotation_fields); } - } - - // Remove annotation forms that are deleted - var valid_fields = []; - for (var f in annotation_fields) { - valid_fields.push("field-" + f); - } - var present_annotations = $(".post-annotations").first().find(".post-annotation") - present_annotations.each(function(){ - let present_id = $(this).attr("class").split(" ")[1]; - if (!valid_fields.includes(present_id)) { - $("." + present_id).remove(); - } - }); - - // Hide annotations if there's no fields leftover - var leftover_annotations = $(".post-annotations").first().find(".post-annotation"); - if (leftover_annotations.length < 1) { - if ($(".post-annotations").first().is(':visible')) { - annotations.toggleAnnotations(); + else { + $("#apply-annotation-fields").html(" Applying") + annotations.saveAnnotationFields(new_annotation_fields); } } - - $("#save-annotation-fields").html(" Apply") }, - saveAnnotationFields: function (annotation_fields){ + saveAnnotationFields: function (new_fields){ // Save the annotation fields used for this dataset // to the datasets table. + // `old fields` can be given to warn the user if changes to existing fields + // will affect annotations, like deleting a field or changing its type. - if (annotation_fields.length < 1 || annotation_fields == undefined) { - annotation_fields = annotation_fields.parseAnnotationFields; - } - - // If there's annotation fields, we can enable/disable the buttons - annotations.fieldsExist(); + let dataset_key = $("#dataset-key").text(); - var dataset_key = $("#dataset-key").text(); - var json_annotations = JSON.stringify(annotation_fields); + if (new_fields.length < 1) { + return; + } // AJAX the annotation forms $.ajax({ url: getRelativeURL("explorer/save_annotation_fields/" + dataset_key), type: "POST", contentType: "application/json", - data: json_annotations, + data: JSON.stringify(new_fields), + success: function () { + // If the query is accepted by the server + // simply reload the page to render the template again. + window.location.replace(window.location.href); + }, + error: function (error) { + console.log(error); - success: function (response) { - // If the query is rejected by the server. - if (response == 'success') { - $("#annotations-editor-container").hide(); - $("#save-annotation-fields").addClass("invalid") - $("#save-annotation-fields").prop("disabled", true); + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); } - - // If the query is accepted by the server. else { - annotations.warnEditor("Couldn't save annotation fields"); + annotations.warnEditor("Server error, couldn't save annotation fields.") } - }, - error: function (error) { - annotations.warnEditor(error); + $("#apply-annotation-fields").html(" Apply"); } }); }, - saveAnnotations: function (e){ + checkFieldChanges(new_fields, old_fields) { + + let deleted_fields = []; + let changed_type_fields = []; + + // Warn the user in case fields are deleted or changed from text to choice. 
              +		if (old_fields) {
              +			let text_fields = ["text", "textarea"];
              +			let choice_fields = ["checkbox", "dropdown"];
              +
              +			for (let old_field_id in old_fields) {
              +
              +				// Deleted
              +				if (!new_fields || !(old_field_id in new_fields)) {
              +					deleted_fields.push(old_fields[old_field_id]["label"]);
              +				} else {
              +					let old_type = old_fields[old_field_id]["type"];
              +					let new_type = new_fields[old_field_id]["type"];
              +					if (old_type !== new_type) {
              +						// Changed from text to choice, or the reverse.
              +						// In this case annotations will be deleted.
              +						// Changes from dropdown to checkbox also result in deleted annotations.
              +						if ((text_fields.includes(old_type) && choice_fields.includes(new_type)) ||
              +							(choice_fields.includes(old_type) && text_fields.includes(new_type)) ||
              +							(choice_fields.includes(old_type) && choice_fields.includes(new_type))) {
              +							changed_type_fields.push(old_fields[old_field_id]["label"]);
              +						}
              +					}
              +				}
              +			}
              +		}
              +
              +		// Ask for confirmation
              +		if (deleted_fields.length > 0 || changed_type_fields.length > 0) {
              +			let msg = "";
              +			if (deleted_fields.length > 0 && changed_type_fields.length > 0) {
              +				msg = `Deleting fields and changing field types will also delete existing annotations that belonged to them.
              +				Do you want to continue?`;
              +			}
              +			else if (changed_type_fields.length > 0) {
              +				msg = `Changing field types will also delete existing annotations that belonged to them.
              +				Do you want to continue?`;
              +			}
              +			else if (deleted_fields.length > 0) {
              +				msg = `Deleting fields will also delete existing annotations that belonged to them.
              +				Do you want to continue?`;
              +			}
              +			popup.confirm(msg, "Confirm", () => {
              +				annotations.saveAnnotationFields(new_fields);
              +			});
              +		}
              +		else {
              +			annotations.saveAnnotationFields(new_fields);
              +		}
              +	},
              +
              +	saveAnnotations: function (){

              		// Write the annotations to the dataset and annotations table.
              -		// First we're gonna collect the data for this page.
              -		// Loop through each post's annotation field.
              -		var anns = {};
              -		var dataset_key = $("#dataset-key").text();
              +		// First we're going to collect the data for this page.
              +		// Loop through each post's annotation fields.
              +		let anns = [];
              +		let dataset_key = $("#dataset-key").text();

              		$(".posts > li").each(function(){
              -			let post_id = this.id.split("-")[1];
              -			let vals_changed = false;
              			let post_annotations = $(this).find(".post-annotations");
              			if (post_annotations.length > 0) {
              -				let post_vals = {};
              				post_annotations.find(".post-annotation").each(function(){
              -					let label = $(this).find(".annotation-label").text();
              -					let annotation_type = $(this).attr("class").split(" ").pop();
              -					let val = "";
              -					let edited = false
              -
              -					if (annotation_type == "text" || annotation_type == "textarea") {
              -						val = $(this).find(".post-annotation-input").val();
              -						// It can be the case that the input text is deleted
              -						// In this case we *do* want to push new data, so we check
              -						// whether there's an 'edited' class present and save if so.
              
- if ($(this).find(".post-annotation-input").hasClass("edited")) { - edited = true - } - } - else if (annotation_type == "dropdown") { - let selected = $(this).find(".post-annotation-options").val(); - val = selected; - } - else if (annotation_type == "checkbox") { - val = []; - $(this).find(".post-annotation-options > input").each(function(){ - if ($(this).is(":checked")) { - val.push($(this).val()); - } - if ($(this).hasClass("edited")) { - edited = true - } - }); - if (!val.length > 0) { - val = undefined; + // Extract annotation object from edited elements + if ($(this).hasClass("edited")) { + let annotation = annotations.parseAnnotation($(this)); + if (Object.keys(annotation).length > 0 ) { + anns.push(annotation); } } - if ((val != undefined && val != "") || edited) { - vals_changed = true; - post_vals[label] = val; - } }); - - if (vals_changed){ - anns[post_id] = post_vals; - } } - }); + }) - - $("#save-annotations").html(" Saving annotations") - annotations.disableSaving(); + let save_annotations = $("#save-annotations"); + save_annotations.html(" Saving annotations") - let code = "" $.ajax({ url: getRelativeURL("explorer/save_annotations/" + dataset_key), type: "POST", @@ -689,86 +512,45 @@ const annotations = { data: JSON.stringify(anns), success: function (response) { - - if (response == 'success') { - code = response - - annotations.enableSaving(); - $("#save-annotations").html(" Annotations saved"); - $("#save-annotations").addClass("invalid").prop("disabled", true); - old_annotation_fields = $("#annotation-fields").html(); - // alert(alert_message); - } - else { - annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); - console.log(response); - } + save_annotations.html(" Save annotations"); + annotations.notifySaved(); }, error: function (error) { - annotations.enableSaving(); - $("#save-annotations").html(" Save annotations"); - alert("Could't save annotations"); console.log(error) - } - }); - }, - - writeAnnotations: function () { - // Write the annotations to the dataset. - var dataset_key = $("#dataset-key").text(); - - $.ajax({ - url: getRelativeURL("api/queue-processor/"), - method: "POST", - data: {"key": dataset_key, "processor": "write-annotations"}, - - success: function (response) { - console.log(response) - if (response == "success") { - annotations.disableSaving(); + if (error.status == 400) { + annotations.warnEditor(error.responseJSON.error); } else { - console.log(response) + annotations.warnEditor("Server error, couldn't save annotations.") } - }, - error: function (error) { - console.log(error) + save_annotations.html(" Save annotations"); } }); - window.open(getRelativeURL("results/" + dataset_key, "__blank")); }, fieldsExist: function(){ // Annotation fields are sent by the server // and saved in a script in the header. // So we just need to check whether they're there. 
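              		// For reference, a minimal sketch of what the server is assumed to
              		// expose here (IDs and labels are hypothetical):
              		//   annotation_fields = {"1a2b": {"label": "stance", "type": "dropdown",
              		//                                 "options": {"o1": "agree", "o2": "disagree"}}};
              		// so a non-empty object means this dataset has annotation fields.
              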
              -
              -		if (Object.keys(annotation_fields).length < 1) {
              -			$("#toggle-annotations").addClass("invalid");
              -			return false;
              -		}
              -		else {
              -			$("#toggle-annotations").removeClass("invalid");
              -			return true;
              -		}
              +		return Object.keys(annotation_fields).length >= 1;
              	},

              -	enableSaving: function(){
              -		// Enable saving annotations to the database
              -		$("#save-annotations, #save-to-dataset").removeClass("invalid").removeAttr("disabled");
              -		$("#save-annotations").html(" Save annotations");
              -	},
              -
              -	disableSaving: function(){
              -		// Disable saving annotations to the database
              -		$("#save-annotations, #save-to-dataset").addClass("invalid").prop("disabled", true);
              +	// Save annotations three seconds after the last change
              +	startSaveTimer: function() {
              +		// Reset the save timer if it was already ongoing,
              +		// so we're not making unnecessary calls when edits are still being made.
              +		if (save_timer) {
              +			clearTimeout(save_timer);
              +			save_timer = null;
              +		}
              +		save_timer = setTimeout(function() {
              +			annotations.saveAnnotations();
              +		}, 3000);
              	},

              	warnEditor: function(warning) {
              -
              -		let warn_field = $("#annotations-input-warning");
              +		// Show a warning in the annotation fields editor
              +		let warn_field = $("#input-warning");
              		warn_field.html(warning);
              		if (warn_field.hasClass("hidden")) {
              			warn_field.removeClass("hidden");
              @@ -776,105 +558,160 @@
              		}
              	},

              -	toggleAnnotations: function() {
              -		let ta = $("#toggle-annotations");
              -		if (ta.hasClass("hidden")) {
              -			ta.removeClass("hidden");
              -			ta.html(" Hide annotations");
              -			$(".post-annotations").show(200);
              -		}
              -		else {
              -			ta.addClass("hidden");
              -			ta.html(" Show annotations");
              -			$(".post-annotations").hide(200);
              +	notifySaved: function() {
              +		// Flash a fixed div with the notice that annotations are saved.
              +		let notice = $("#save-annotations-notice");
              +		if (!notice.is(":visible")) {
              +			notice.fadeIn(300);
              +			notice.delay(1500).fadeOut(1000);
              		}
              	},

              -	getAnnotationsDiv: function(id){
              -		// Returns an input field element with a pseudo-random ID, if none is provided.
              -		if (id == undefined || id == 0) {
              -			id = annotations.randomInt();
              -		}
              -		// Returns an annotation div element with a pseudo-random ID
              -		return `
              
- - - -
              `.replace("{{FIELD_ID}}", id);
              +	showAnnotations: function() {
              +
              +		// Update the toggle button
              +		let ta = $("#toggle-annotations");
              +		ta.addClass("shown");
              +		ta.html(" Hide annotations");
              +
              +		// Store state in URL params
              +		let queryParams = new URLSearchParams(window.location.search);
              +		queryParams.set("show", "true");
              +		history.replaceState(null, null, "?"+queryParams.toString());
              +
              +		// Animate the annotations div open. A bit convoluted, but needed
              +		// so we can animate to the element's auto height.
              +		let pa = $(".post-annotations");
              +		let current_height = pa.height();
              +		let auto_height = pa.css("height", "auto").height();
              +		pa.height(current_height).animate({"height": auto_height}, 250, function(){
              +			pa.height("auto");
              +		});
              +	},
              +
              +	hideAnnotations: function() {
              +
              +		// Store state in URL params
              +		let queryParams = new URLSearchParams(window.location.search);
              +		queryParams.delete("show");
              +		history.replaceState(null, null, "?"+queryParams.toString());
              +
              +		let ta = $("#toggle-annotations");
              +		ta.removeClass("shown");
              +		ta.html(" Show annotations");
              +		let pa = $(".post-annotations");
              +		pa.animate({"height": 0}, 250);
              +	},
              +
              +	addAnnotationField: function(){
              +		/*
              +		Adds an annotation field input element;
              +		these have no IDs yet, we'll add a hashed database-label string when saving.
              +		*/
              +
              +		let annotation_field = `
              +
              
  • + + + + + + + + + +
  • + `.replace("randomint", Math.floor(Math.random() * 100000000).toString()); + $("#annotation-field-settings").append(annotation_field); }, getInputField: function(id){ - // Returns an input field element with a pseudo-random ID, if none is provided. - if (id == undefined || id == 0) { - id = annotations.randomInt(); + // Returns an option field element with a pseudo-random ID, if none is provided. + if (id === undefined || id === 0) { + id = Math.floor(Math.random() * 100000000).toString(); } - return "
    "; + return ""; }, - randomInt: function(){ - return Math.floor(Math.random() * 100000000); + markChanges: function(el) { + // Adds info on edits on post annotation to its element, so we can save these to the db later. + // Currently includes the time of edits and the username of the annotator. + let current_username = $("#current-username").html(); + let current_date = Date.now() / 1000; + $(el).addClass("edited"); + $(el).find(".annotation-author").html(current_username); + $(el).find(".epoch-timestamp-edited").html(current_date); + $(el).find(".timestamp-edited").html(getLocalTimeStr(current_date)); + annotations.startSaveTimer(); } }; const page_functions = { init: function() { - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseover', function() { let post = 'post-' + this.getAttribute('href').split('-').pop(); document.querySelector('#' + post).classList.add('highlight'); })); - document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function(e) { + document.querySelectorAll('.quote a').forEach(link => link.addEventListener('mouseout', function() { document.querySelectorAll('.thread li').forEach(link => link.classList.remove('highlight')); })); - // Reorder the dataset when the sort type is changed - $("#sort-select").on("change", function(){ - - let selected = $(this).find("option:selected"); - - // Pass whether the order should be reversed or not - let sort_order = selected.data("desc"); - if (sort_order){ - sort_order = "&desc=true" - } - else { - sort_order = "" + // Change timestamps to the client's timezone + document.querySelectorAll(".timestamp-to-convert").forEach(function(el){ + el.innerText = getLocalTimeStr(el.innerText); + }); + + // Make annotation field editor sortable with jQuery UI. + $('#annotation-field-settings').sortable({ + cursor: "s-resize", + handle: ".handle", + items: "li", + axis: "y", + change: function() { + } + }); + + // Reorder the dataset when the sort type is changed + $(".sort-select").on("change", function(){ - // Pass whether we should treat this value as an integer - let force_int = selected.data("force-int"); - if (force_int){ - force_int = "&int=true" + // Get the column to sort on, and whether we should sort in reverse. + let selected = $("#column-sort-select").find("option:selected").val(); + let order = $("#column-sort-order").find("option:selected").val(); + + let queryParams = new URLSearchParams(window.location.search); + let dataset_key = $("#dataset-key").text(); + queryParams.set("sort", selected) + if (order === "reverse"){ + queryParams.set("order", "reverse"); } else { - force_int = "" + queryParams.delete("order"); } - - window.location.href = getRelativeURL('explorer/dataset/' + $("#dataset-key").text() + "?sort=" + $(this).val() + sort_order + force_int); + window.location.href = getRelativeURL("results/" + dataset_key + "/explorer/?" + queryParams.toString()); }); - // Change the dropdown sort option based on the URL parameter + // Show annotations if it's in the URL params, + // and change the dropdown sort option based on the sort parameter. let searchParams = new URLSearchParams(window.location.search) - let sort_order = searchParams.get("sort"); - let desc = searchParams.get("desc"); - - if (sort_order) { - // There can be multiple options with the same key since - // one of them might be reversed and the other not (e.g. 
              -			// timestamps sorted by new to old and vice versa).
              -			// So select the sort order with the right desc attribute.
              -			if (desc == "true") {
              -				$("#sort-select").find("option[value='" + sort_order + "'][data-desc='True']").attr("selected", "selected");
              -			}
              -			else {
              -				$("#sort-select").val(sort_order);
              +		let show_annotations = searchParams.get("show");
              +		// Never show annotations if there are no annotation fields
              +		if (annotation_fields) {
              +			if (show_annotations) {
              +				annotations.showAnnotations();
              			}
              		}
              +		let selected = searchParams.get("sort");
              +		let sort_order = searchParams.get("order");
              +		$("#column-sort-select").find("option[value='" + selected + "']").attr("selected", "selected");
              +		if (sort_order) {
              +			$("#column-sort-order").find("option[value='" + sort_order + "']").attr("selected", "selected");
              +		}
              	}
              };

              @@ -894,5 +731,10 @@ function getRelativeURL(endpoint) {
              	return root + endpoint;
              }

              +function getLocalTimeStr(epoch_timestamp) {
              +	let local_date = new Date(parseInt(epoch_timestamp) * 1000);
              +	local_date = Intl.DateTimeFormat("en-GB", {dateStyle: "medium", timeStyle: "medium"}).format(local_date);
              +	return local_date;
              +}
              });
              \ No newline at end of file
              diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js
              index 950ba523e..b3ebeb6c4 100644
              --- a/webtool/static/js/fourcat.js
              +++ b/webtool/static/js/fourcat.js
              @@ -1653,7 +1653,6 @@ const ui_helpers = {
              		}
              	},

              -
              	/**
              	 * Ask for confirmation before doing whatever happens when the event goes through
              	 *
              diff --git a/webtool/templates/components/datasource-option.html b/webtool/templates/components/datasource-option.html
              index 4ee4ba16e..2b85e3097 100644
              --- a/webtool/templates/components/datasource-option.html
              +++ b/webtool/templates/components/datasource-option.html
              @@ -147,6 +147,56 @@

              +	{% elif settings.type == "datasources_table" %}
              +	{% set tooltips = {} %}
              +
              
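              +	{# A sketch of the settings shape this block assumes (names are hypothetical):
              +	   settings = {"type": "datasources_table",
              +	               "columns": {"some_column": {"type": "toggle", "help": "Some column"}},
              +	               "default": {"some_datasource": {"some_column": True}}}
              +	   One row is rendered per enabled data source below. #}
              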
    + + + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + + + {% for datasource, datasource_config in datasources_config.items() %} + {% if datasource_config.enabled %} + + + {% for column_id, column in settings.columns.items() %} + + {% endfor %} + + {% endif %} + {% endfor %} + +
    Enabled data sources{{ column.help }} + {% if column.tooltip %} + + {% set x = tooltips.__setitem__(column_id, column.tooltip) %} + {% endif %} +
    {{ datasource_config.name }} + {% set column_value = "" %} + {% if datasource in settings.default and settings.default[datasource][column_id] %} + {% set column_value = settings.default[datasource][column_id] %} + {% endif %} + {% if column.type == "string" %} + + {% elif column.type == "toggle" %} + + {% elif column.type == "choice" %} + + {% endif %} +
    + {% for tooltip, tooltip_text in tooltips.items() %} + + {% endfor %} +
    {% endif %} diff --git a/webtool/templates/components/result-child.html b/webtool/templates/components/result-child.html index 36fb64136..df7da65dd 100644 --- a/webtool/templates/components/result-child.html +++ b/webtool/templates/components/result-child.html @@ -10,14 +10,14 @@ diff --git a/webtool/templates/components/result-details.html b/webtool/templates/components/result-details.html index 145769e4e..de924e4b9 100644 --- a/webtool/templates/components/result-details.html +++ b/webtool/templates/components/result-details.html @@ -110,6 +110,22 @@

    {% include 'components/result-metadata.html' %} + {% set annotation_fields = dataset.get_annotation_fields() %} + {% if annotation_fields %} +
    +
    Annotations
    +
              +							{% set annotations = dataset.get_annotations() %}
              +							{% if annotations %}
              +								{{ annotations|length|numberify }} annotation{% if annotations|length > 1 %}s{% endif %}
              +							{% endif %}
              +							{% for field_id, annotation_field in annotation_fields.items() %}
              +								{{ annotation_field.label }}
              +							{% endfor %}
              +
              
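              +							{# dataset.get_annotations() is assumed to return a flat list with one
              +							   entry per annotated item/field pair, so the count above is the total
              +							   number of annotations rather than the number of annotation fields. #}
              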
    +
    + {% endif %} + {% if has_credentials and current_user.is_authenticated and (__user_config("privileges.admin.can_manipulate_all_datasets") or dataset.is_accessible_by(current_user, "owner")) %}
    API Credentials
    diff --git a/webtool/templates/components/result-result-row.html b/webtool/templates/components/result-result-row.html index 665ed5fd0..04b11d8c8 100644 --- a/webtool/templates/components/result-result-row.html +++ b/webtool/templates/components/result-result-row.html @@ -17,22 +17,22 @@
    {% if dataset.is_finished() and dataset.num_rows > 0 %}
      -
    • - - - {{ dataset.result_file.split(".")[-1] }} ({{ dataset.get_results_path()|filesize }}) - - -
    • - {% if dataset.get_extension() != "csv" and dataset.get_own_processor().map_item %} + {% if dataset.get_own_processor().map_item or dataset.get_annotation_fields() %}
    • - csv + Download csv - +
    • {% endif %} +
    • + + + Original {{ dataset.get_extension() }} ({{ dataset.get_results_path()|filesize }}) + + +
    • {% if (dataset.get_extension() in ("csv", "gexf", "html") or dataset.get_own_processor().map_item) and not __user_config("ui.inline_preview") %}
    • {% if __user_config("privileges.can_use_explorer") %} - - Explore + + Explore & annotate {% endif %} diff --git a/webtool/templates/controlpanel/config.html b/webtool/templates/controlpanel/config.html index ed1108c2d..43697a223 100644 --- a/webtool/templates/controlpanel/config.html +++ b/webtool/templates/controlpanel/config.html @@ -74,6 +74,7 @@

      {{ {"core": "4CAT Core", "datasources": "Data sources", "processors": "Proce {% else %} {% set settings = options[option] %} {% endif %} + {% include 'components/datasource-option.html' %} {% endfor %} diff --git a/webtool/templates/explorer/annotations-editor.html b/webtool/templates/explorer/annotations-editor.html new file mode 100644 index 000000000..224469705 --- /dev/null +++ b/webtool/templates/explorer/annotations-editor.html @@ -0,0 +1,49 @@ + + + +
      +
        + {% if annotation_fields %} + {% for field in annotation_fields %} + {% set annotation_field = annotation_fields[field] %} + {% set annotation_type = annotation_field["type"] %} + {% set label = annotation_field["label"] %} +
      1. + + + + + + + + + + {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} + {% for option_id, option_label in annotation_fields[field]["options"].items() %} + + + + + {% endfor %} + + + + {% endif %} + + +
      2. + {% endfor %} + {% endif %} +
      + +
      diff --git a/webtool/templates/explorer/annotations.html b/webtool/templates/explorer/annotations.html deleted file mode 100644 index fbb0b89bb..000000000 --- a/webtool/templates/explorer/annotations.html +++ /dev/null @@ -1,69 +0,0 @@ -
      -
      - -
      ×
      - -
      -
      -
      Label
      -
      Input type
      -
      Options
      -
      - -
      - {% if annotation_fields %} - - {% for field in annotation_fields %} - - {% set annotation_type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} -
      - - - - - {% if annotation_type == "dropdown" or annotation_type == "checkbox" %} -
      - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} -
      - - -
      - {% endfor %} -
      - - -
      -
      - {% endif %} -
      - {% endfor %} - {% endif %} -
      -
      -
      - - - -

      Note: Changing input types will overwrite existing annotations for the field

      -
      -
      -
      - -
      -
      - - - | - - -
      - -
      \ No newline at end of file diff --git a/webtool/templates/explorer/controls.html b/webtool/templates/explorer/controls.html new file mode 100644 index 000000000..421ab3708 --- /dev/null +++ b/webtool/templates/explorer/controls.html @@ -0,0 +1,69 @@ +
      +
      +

      + {{ dataset.get_label() }} - Explorer +

      + + +
      + +
      +
      +
      diff --git a/webtool/templates/explorer/datasource-templates/generic.html b/webtool/templates/explorer/datasource-templates/generic.html new file mode 100644 index 000000000..ccebaf09c --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/generic.html @@ -0,0 +1,127 @@ + +{% +set all_known_fields = { + "author": ["author", "author_name", "author_fullname", "nickname"], + "time": ["created_utc", "timestamp", "time"], + "title": ["title", "subject"], + "body": ["body", "message"], + "media": ["image", "images", "image_url"], + "tags": ["hashtags", "tags"], + "views": ["views", "num_views"], + "likes": ["likes", "num_likes", "notes"], + "comments": ["num_comments", "reactions"], + "shares": ["shares", "num_shares"], + "url": ["url", "link_url", "post_url", "link"] +} +%} + +{% set fields = {} %} + + +{% set x=fields.__setitem__("id", post["id"]) %} +{% set x=fields.__setitem__("thread_id", post["thread_id"]) %} + + +{% for field, known_fields in all_known_fields.items() %} + {% for known_field in known_fields %} + {% if known_field in post and post[known_field] %} + {% set x=fields.__setitem__(field, post[known_field]) %} + {% endif %} + {% endfor %} +{% endfor %} + + +{% if fields.get("url") and pseudonymised %} + +{% elif fields.get("url") and not pseudonymised %} + +{% endif %} + + +
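              +{# Example of the mapping above (values hypothetical): a post like
              +   {"id": "123", "thread_id": "456", "created_utc": 1700000000, "message": "hi"}
              +   yields fields = {"id": "123", "thread_id": "456", "time": 1700000000,
              +   "body": "hi"}, which the markup below then renders. #}
              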
      + {{ fields.get("id") }} + {{ fields.get("thread_id") }} + + + {% if pseudonymised %} + + + {% else %} + + {{ fields.get("author") }} + {% endif %} + + + + {% if "title" in post and post["title"] %} + {{post.title}} + {% endif %} + + + {% if fields.get("time") is integer %} + {{ fields.get("time")|datetime('%Y-%m-%d %H:%M')|safe }} + {% else %} + {{ fields.get("time") }} + {% endif %} + +
      + + +
      + + + {% if fields.get("media") %} + + {% if "," in fields["media"] %} + {% set media_urls = fields["media"].split(",") %} + {% else %} + {% set media_urls = [fields.get("media")] %} + {% endif %} +
      + {% for media_url in media_urls %} + + + + {% endfor %} +
      + {% endif %} + + +
      + {{ fields.body | safe }} +
      + + + {% if fields.get("tags") %} +
      + {{ fields.tags | safe }} +
      + {% endif %} + + +
      + {% if fields.get("views") %} + + {{ fields.views | commafy }} + + {% endif %} + {% if fields.get("likes") %} + + {% endif %} + {% if fields.get("shares") %} + + {{ fields.shares | commafy }} + + {% endif %} + {% if fields.get("comments") %} + + {{ fields.comments }} + + {% endif %} +
      + +
      diff --git a/webtool/templates/explorer/datasource-templates/instagram.html b/webtool/templates/explorer/datasource-templates/instagram.html new file mode 100644 index 000000000..b4b3d677b --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/instagram.html @@ -0,0 +1,47 @@ +
      + + {% if not pseudonymised %} + + + + {{ post.get("author") }} + {% if post.get("is_verified") %} {% endif %} + {% if post.get("coauthor") %} + and {{ post.get("coauthor") }} + {% endif %} + {{ post.get("timestamp") }} + {% else %} + + + {% endif %} + {% if post.get("location_name") %} + + {% endif %} + +
      + + +
      + {% if post.num_likes %} + + {% endif %} +
      {{ post.get("body") | social_mediafy(datasource='instagram') | safe }}
      + {% if post.num_comments %} + {% if pseudonymised %} +
      {{ post.get("num_comments") | commafy }} comments + {% else %} +
      \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/linkedin.html b/webtool/templates/explorer/datasource-templates/linkedin.html new file mode 100644 index 000000000..68aa70b85 --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/linkedin.html @@ -0,0 +1,73 @@ +
      +
      + {% if post.inclusion_context %} +
      {{ post.inclusion_context }}
      + {% endif %} + + +
      + + +
      {{ post.get("body") | social_mediafy(datasource='linkedin') | safe }}
      + + + {% if post["image_urls"] or post["video_thumb_url"] %} +
      + {% if post["image_urls"] %} + {% for image_url in post["image_urls"].split(",") %} + + {% endfor %} + {% elif post["video_thumb_url"] %} +
      +
      + {% endif %} +
      + {% endif %} + + + +
      + {% for reaction_type in ["reaction_like","reaction_empathy","reaction_praise","reaction_entertainment","reaction_appreciation","reaction_interest"] %} + {% if reaction_type in post and post[reaction_type] > 0 %} + {{ post[reaction_type] }} + {% endif %} + {% endfor %} + + {% if post.get("shares") and post["shares"] > 0 %} + {{ post.get("shares") | numberify }} reposts + {% endif %} + {% if (post.get("shares") and post["shares"] > 0) and (post.get("comments") and post["comments"] > 0) %} • {% endif %} + {% if post.get("comments") and post["comments"] > 0 %} + {{ post.get("comments") | numberify }} comments + {% endif %} + +
      + +
      \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/telegram.html b/webtool/templates/explorer/datasource-templates/telegram.html new file mode 100644 index 000000000..f3b8dd9e4 --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/telegram.html @@ -0,0 +1,91 @@ + + + {% set day = post.unix_timestamp | datetime(fmt="%d %B", wrap=False) %} + {% set prev_post = posts[post_index - 1] if post_index > 0 else {} %} + {% set new_day = day if not prev_post or prev_post.get("unix_timestamp", 0) | datetime(fmt="%d %B", wrap=False) != day else False %} + {% set new_author = True if not prev_post or prev_post.author != post.author else False %} + + {% if new_day %} +
      + {{ new_day }} +
      + {% endif %} + +
      +
      + {% if new_author or new_day %} +
      + + {% set author = post.author_username if not post.author_name else post.author_name %} + {% if not pseudonymised and author %} + + {% for name in author.split()[:2] %}{{ name[0] }}{% endfor %} + {% else %} + + {% endif %} + +
      + + + + + + + + + + + +
      +
      +
      + {% if not pseudonymised %} + {{ author }} + {% else %} + + {% endif %} +
      + {% else %} +
      +
      + {% endif %} + {% if post.attachment_type %} +
      + + + +
      + {% endif %} +
      + {% if post.reply_to %} + + {% endif %} + {{ post.body_markdown | markdown | social_mediafy(datasource="telegram") | safe }} +
      + + {% if post.reactions %} +
      + {% set reactions = post.reactions|string_counter(emoji=True) %} + {% for reaction, count in reactions.items() %} + {{ reaction }}{{ count }} + {% endfor %} +
      + {% endif %} +
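              +		{# string_counter is assumed to collapse the stored reactions string into
              +		   a {reaction: count} mapping, e.g. "👍,👍,🔥" -> {"👍": 2, "🔥": 1};
              +		   the input format shown here is illustrative. #}
              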
      + {% if post.views %} {{ post.views|numberify }}{% endif %} + {% if post.forwards %} {{ post.forwards|numberify }}{% endif %} + + {{ post.unix_timestamp | datetime(fmt="%H:%M", wrap=False) }} UTC + + {% if post.unix_timestamp_edited %} + + | edited {{ post.unix_timestamp_edited | datetime(fmt="%H:%M", wrap=False) }} UTC + + {% endif %} + {% if not pseudonymised %} + + {% endif %} +
      +
      +
      \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/tiktok.html b/webtool/templates/explorer/datasource-templates/tiktok.html new file mode 100644 index 000000000..e89ef13bc --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/tiktok.html @@ -0,0 +1,72 @@ +
      +
      +
      + {% if not pseudonymised %} + + {% else %} + + {% endif %} +
      + +
      + +
              +
              +					{% if pseudonymised %}
              +
              +
              +					{% else %}
              +
              +						{{ post.get("author") }} {{ post.get("author_full") }}
              +					{% endif %}
              +
              +
              +					{{ post.get("timestamp") }}
              +
              +
              
      {{ post.get("music_name") }}
      + +
      + + + {{ post.body | social_mediafy(datasource="tiktok") | safe }} + + +
      + +
      + +
      + + +
      + {% if post.get("plays") %} + + {{ post["plays"] | numberify }} + + {% endif %} + {% if post.get("likes") %} + + {% endif %} + {% if post.get("shares") %} + + {{ post["shares"] | numberify }} + + {% endif %} + {% if post.get("comments") %} + + {{ post["comments"] | numberify }} + + {% endif %} + + {% if post.get("tiktok_url") and pseudonymised %} + + {% elif post.get("tiktok_url") and not pseudonymised %} + + {% endif %} +
      +
      +
      + + diff --git a/webtool/templates/explorer/datasource-templates/tumblr.html b/webtool/templates/explorer/datasource-templates/tumblr.html new file mode 100644 index 000000000..65ad402ac --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/tumblr.html @@ -0,0 +1,221 @@ +{% set reblog = True if post.parent_post_author else False %} +
      +
      + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
      + + + +
      + {% endif %} + + {% if reblog %} + + {% endif %} + {{ post.get("author") }} + {% if reblog %} reblogged {{ post.parent_post_author }}{% endif %} + + {% else %} + + + {% if post.parent_post_author %} reblogged{% endif %} + {% endif %} +
      + +
      + + +{% if reblog %} + + {% for reblog_author in post.reblog_trail.split(",") %} +
      + {% if not pseudonymised %} +
      + + + +
      + {{ reblog_author }} + {% else %} + + {% endif %} +
      +
      + {% if post.get("image_urls_reblogged") %} + {% for image_url in post.image_urls_reblogged.split(",") %} +
      + +
      + {% endfor %} + {% endif %} +
      + {{ post.body_reblogged.split("\n\n")[loop.index - 1] }} +
      +
      + {% endfor %} + + {% if post.body %} +
      + {% if not pseudonymised %} + + + + {% if post["author_avatar_url"] %} +
      + + + +
      + {% endif %} + + {{ post.get("author") }} + + {% else %} + + + {% endif %} +
      + {% endif %} + +{% endif %} + +
      + + + + {% set block_counts = namespace({'text': 0, 'image': 0, 'video': 0, 'audio': 0, 'link': 0, 'ask': 0}) %} + {% set content_order = post.content_order.split(",") %} + {% for block in content_order %} + {% if block == "text" %} + +
      {{ post.get("body_markdown").split("\n")[block_counts.text] | markdown | social_mediafy(datasource='tumblr') | safe }}
      + {% set block_counts.text = block_counts.text + 1 %} + {% elif block == "image" %} + +
      + +
      + {% set block_counts.image = block_counts.image + 1 %} + {% elif block == "video" %} + + + {% set block_counts.video = block_counts.video + 1 %} + {% elif block == "audio" %} + +
      + +
      + {% set block_counts.audio = block_counts.audio + 1 %} + {% elif block == "link" %} + + {% set url = post.link_urls.split(",")[block_counts.link] %} + {% set link_title = post.link_titles.split(",")[block_counts.link] %} + {% set link_description = post.link_descriptions.split(",")[block_counts.link] %} + + + + {% set block_counts.link = block_counts.link + 1 %} + {% elif block == "poll" %} + + +
      +
      {{ post["poll_question"] }}
      +
        + {% for poll_answer in post["poll_answers"].split(",") %} +
      • {{ poll_answer }}
      • + {% endfor %} +
      +
              +
              +				{% elif block == "ask" %}
              +					{% set start_ask_block = True if loop.index == 1 or content_order[loop.index - 2] != "ask" else False %}
              +					{% set end_ask_block = True if loop.index == content_order|length or content_order[loop.index] != "ask" else False %}
              +					{% if start_ask_block %}
              +
              
      +
      +
      {% if not pseudonymised %}{{ post["author_ask"] }}{% else %}{% endif %} asked:
      + {% endif %} +

      {{ post.get("body_ask").split("\n")[block_counts.ask] | markdown | social_mediafy(datasource='tumblr') | safe }}

      + {% if end_ask_block %} +
      +
      +
      + {% if not pseudonymised %} + + {% endif %} +
      + {% endif %} + {% set block_counts.ask = block_counts.ask + 1 %} + {% endif %} + {% endfor %} + + + {% if post.get("tags") %} +
      +
        + {% for tag in post["tags"].split(",") %} +
      • #{{ tag }}
      • + {% endfor %} +
      +
      + {% endif %} +
      + +
      + + +
      {{ post.unix_timestamp | datetime(fmt="%d %b %Y, %H:%M", wrap=False) }} UTC
      + + + {% if post.note_count %} +
      +
      + {{ post.get("note_count") | commafy }} note{% if post.get("note_count", 0) > 1 %}s{% endif %} + + {% if post.get("reblog_count") %} + {{ post.reblog_count | commafy }} + {% endif %} + + {% if post.get("like_count") %} + + {% endif %} + + {% if post.get("reply_count") %} + {{ post.get("reply_count") | commafy }} + {% endif %} +
      + {% if post.get("authors_replied") %} +
      + {% for author_replied in post.get("authors_replied").split(",") %} +
    • +
      + {% if not pseudonymised %} + + {% else %} + + + {% endif %} +
      +
      +
      {% if not pseudonymised %}{{ author_replied }}{% else %}{% endif %}
      +
      {{ post.replies.split("\n\n")[ loop.index - 1 ].replace(author_replied + ": ", "") | social_mediafy(datasource='tumblr') | safe }}
      +
      +
    • + {% endfor %} +
      + {% endif %} +
      + {% endif %} +
      \ No newline at end of file diff --git a/webtool/templates/explorer/datasource-templates/twitter.html b/webtool/templates/explorer/datasource-templates/twitter.html new file mode 100644 index 000000000..fa238674a --- /dev/null +++ b/webtool/templates/explorer/datasource-templates/twitter.html @@ -0,0 +1,82 @@ +
      +
      +
      + {% if not pseudonymised %} + + {% else %} + + {% endif %} +
      +
      +
      + {% if not pseudonymised %} + {{ post.get("author_fullname")}} {% if post.get("verified") %} {% endif %}@{{ post.get("author") }} + + {% else %} + + {% endif %} + {{ post.get("timestamp") }} + +
      + +
      {{ post.body | social_mediafy(datasource='twitter') | safe }}
      + + + {% if post.get("images") %} + {% set media_url = post.get("images") %} + {% elif post.get("videos") %} + {% set media_url = post.get("videos") %} + {% endif %} + {% if media_url %} +
      + {% for url in media_url.split(",") %} + + {% endfor %} +
      + {% endif %} + + + {% if post.get("quote_author") %} +
      +
      + + {% if not pseudonymised %} + @{{ post.get("quote_author") }} + {% else %} + + {% endif %} +
      + + {% if post.quote_body %} +
      + {{ post.quote_body | social_mediafy(datasource='twitter') | safe }} +
      + {% endif %} + + {% if post.get("quote_images") %} + {% set media_url = post.get("quote_images") %} + {% elif post.get("quote_videos") %} + {% set media_url = post.get("quote_videos") %} + {% endif %} + {% if media_url %} +
      + {% for url in media_url.split(",") %} + + {% endfor %} +
      + {% endif %} +
      + {% endif %} +
      + {{ post.get("reply_count") | numberify }} + {{ post.get("retweet_count") }} + + {% if post.get("impression_count") %} {{ post.get("impression_count") | numberify }}{% endif %} + + {% if not pseudonymised %} + + {% endif %} +
      +
      +
      +
      \ No newline at end of file diff --git a/webtool/templates/explorer/explorer.html b/webtool/templates/explorer/explorer.html index 9a7251990..a19aa23c4 100644 --- a/webtool/templates/explorer/explorer.html +++ b/webtool/templates/explorer/explorer.html @@ -1,57 +1,72 @@ - - - 4CAT Explorer • {% if parameters and parameters.get("label") %}{{ parameters.get("label") }}{% elif key %}{{ key }}{% elif thread %}{{ thread }}{% endif %} +{% extends "layout.html" %} - +{% block title %}Explorer: {{ dataset.get_label() }} • 4CAT{% endblock %} +{% block breadcrumbs %}{% set navigation.current = "explorer" %}{% endblock %} + +{% block body %} + + + + + + + + + + +{% set pseudonymised = True if dataset.parameters and dataset.parameters.get('pseudonymise', False) %} + +{% set key = dataset.data.key %} + + +{% include "explorer/controls.html" %} + + +{% include "explorer/pagination.html" %} + + + + + + +{% if datasource == '4chan' %} - - - - - - - - - - - - - - - - - - - {% if custom_css %} - - {% endif %} - - - -
      - {% include "explorer/header.html" %} - - {% if not thread %} - {% include "explorer/annotations.html" %} - {% endif %} -
      - -
      - {% include "explorer/nav-pages.html" %} -
        - {% for post in posts %} - {% include "explorer/post.html" %} - {% endfor %} -
      - {% include "explorer/nav-pages.html" %} +{% endif %} + + +
      +
      +
        + {% for post in posts %} + {% set post_index = loop.index - 1 %} + {% include "explorer/post.html" %} + {% endfor %} +
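              +	{# post_index is passed along so datasource templates can inspect the
              +	   previous post; telegram.html, for instance, reads posts[post_index - 1]
              +	   to decide whether to print a new day marker or author header. #}
              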
      +
      -
      - {% include "explorer/footer.html" %} -
      - \ No newline at end of file +{% include "explorer/pagination.html" %} + +
      Annotations saved
      +{% endblock %} \ No newline at end of file diff --git a/webtool/templates/explorer/footer.html b/webtool/templates/explorer/footer.html deleted file mode 100644 index 258a61e98..000000000 --- a/webtool/templates/explorer/footer.html +++ /dev/null @@ -1 +0,0 @@ -

      Rendered by 4CAT

      \ No newline at end of file diff --git a/webtool/templates/explorer/header.html b/webtool/templates/explorer/header.html deleted file mode 100644 index f700a10a4..000000000 --- a/webtool/templates/explorer/header.html +++ /dev/null @@ -1,54 +0,0 @@ -

      - - Return to dataset - - - - 4CAT Explorer (beta){% if parameters and parameters.get("label") %} • {{ parameters.get("label") }}{% elif thread %} • {{ thread }}{% endif %} - -

      -{{ key }} -
      - {% if key %} -
      - {% if post_count > max_posts %} -

      Large dataset - only showing the first {{ max_posts }} posts. Use filter processors to limit the dataset.

      - {% set post_count = max_posts %} - {% endif %} -

      Showing posts {{ offset + 1 }} - {{ post_count if (offset + limit) > post_count else (offset + limit) }} ({{ post_count }} in total).

      - {% if custom_fields and custom_fields[0] == "invalid" %} -

      Invalid custom fields JSON - can't show posts properly ({{ custom_fields[1] }}).

      - {% endif %} - {% if custom_fields and 'sort_options' in custom_fields %} -
      -

      Sort posts by: - -

      -
      - {% endif %} -
      - - {% elif thread %} -

      Showing {{ post_count }} posts from {{ datasource }}/{{ board }} thread {{ thread }}.

      -

      Note that the archived posts may not be complete.

      - {% endif %} - -
      \ No newline at end of file diff --git a/webtool/templates/explorer/nav-pages.html b/webtool/templates/explorer/nav-pages.html deleted file mode 100644 index 23fe84f8d..000000000 --- a/webtool/templates/explorer/nav-pages.html +++ /dev/null @@ -1,61 +0,0 @@ - \ No newline at end of file diff --git a/webtool/templates/explorer/pagination.html b/webtool/templates/explorer/pagination.html new file mode 100644 index 000000000..1dbb4f05e --- /dev/null +++ b/webtool/templates/explorer/pagination.html @@ -0,0 +1,64 @@ + \ No newline at end of file diff --git a/webtool/templates/explorer/post-annotations.html b/webtool/templates/explorer/post-annotations.html new file mode 100644 index 000000000..02e208bcb --- /dev/null +++ b/webtool/templates/explorer/post-annotations.html @@ -0,0 +1,86 @@ +
      + + {% if annotation_fields %} + {% if annotations and post.id in annotations %} + {% set post_annotations = annotations[post.id] %} + {% endif %} + + {% for field in annotation_fields %} + + {% set type = annotation_fields[field]["type"] %} + {% set label = annotation_fields[field]["label"] %} + + {# Loop through annotations for this post and + retrieve the data from the one matching this annotation field #} + {% set an = namespace(an={}) %} + {% for post_annotation in post_annotations %} + {% if post_annotation.field_id == field %} + {% set an.an = post_annotation %} + {% endif %} + {% endfor %} + {% set annotation = an.an %} + +
      + + {% if type == "text" %} + + + {% elif type == "textarea" %} + + + {% elif type == "dropdown" %} + + + {% elif type == "checkbox" %} +
      + {% for option_id, option_label in annotation_fields[field]["options"].items() %} + {% set checked = "checked" if option_label in annotation.value else "" %} + + {% endfor %} +
              +			{% endif %}
              +
              +			{# Tooltip with metadata on the annotation #}
              +			{% if annotation.author or annotation.author_original or annotation.timestamp or annotation.metadata %}
              +
              +
              +			{% endif %}
              +
              +			{# Store some invisible data here so we can retrieve it with JS #}
              +
              +
              
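              +			{# For reference, a sketch of the `annotations` structure this template
              +			   assumes (IDs and values hypothetical): a mapping of post IDs to lists
              +			   of annotation objects, e.g. {"98765": [{"field_id": "1a2b",
              +			   "value": "agree", "author": "alice", "timestamp": 1700000000}]}. #}
              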
      + {% endfor %} + {% endif %} +
      \ No newline at end of file diff --git a/webtool/templates/explorer/post.html b/webtool/templates/explorer/post.html index ac6827fc9..61074757b 100644 --- a/webtool/templates/explorer/post.html +++ b/webtool/templates/explorer/post.html @@ -1,174 +1,16 @@
    • - - {% set special_fields = ["external_url", "author", "body", "timestamp", "image", "images", "sort_options", "markdown"] %} - {% set pseudonymised = True if parameters and ('pseudonymise' in parameters and parameters.get('pseudonymise')) else False %} + + {% if template == "datasource" %} + {% include "explorer/datasource-templates/" + datasource + ".html" %} - - {% if custom_fields and "external_url" in custom_fields %} - {% set external_url = custom_fields.external_url | post_field(post) %} - {% if external_url and pseudonymised %} - - {% elif external_url and not pseudonymised %} - - {% endif %} - {% endif %} - -
      - - {% if pseudonymised %} - - - {% else %} - - {% if custom_fields and custom_fields.get("author") %} - {% set author = custom_fields.author | post_field(post) | safe %} - {% else %} - {% set author = post.author %} - {% endif %} - {{ author }} - - {% endif %} - - {% if 'thread_id' in post %} - {% if is_local %}{{ post.thread_id }}{% else %}{{ post.thread_id }}{% endif %} - {% endif %} - - {% if 'timestamp' in post %} - {% if post.timestamp is integer %} - {{ post.timestamp|datetime('%Y-%m-%d %H:%M')|safe }} - {% else %} - {{ post.timestamp }} - {% endif %} - {% endif %} - - - {% if custom_fields %} - {% for custom_field in custom_fields %} - - {% if custom_field not in special_fields %} - - {% set custom_value = custom_fields[custom_field] | post_field(post) | safe %} - {% if custom_value and custom_value != "None" %} - {% if not ("author" in custom_field and pseudonymised) %} - {{ custom_value | safe }} - {% endif %} - {% endif %} - - {% endif %} - {% endfor %} - {% endif %} - -
      - - -
      - - {% if custom_fields and 'image' in custom_fields %} - {% set img_link = custom_fields['image'] | post_field(post) %} - - {% if 'not-found:' in img_link %} -
      - -
      - - {% elif 'retrieve:' in img_link %} - {% set img_link = img_link[9:] %} - -
      - -
      -
      - - {% elif img_link %} - -
      - -
      -
      - {% endif %} - - - {% elif custom_fields and 'images' in custom_fields %} - {% set img_links = custom_fields['images'] | post_field(post) %} - {% if img_links %} - {% set img_links = img_links.split(",") %} -
      - {% for img_link in img_links %} - -
      - -
      -
      - {% endfor %} -
      - {% endif %} - {% endif %} - - - {% if custom_fields and 'body' in custom_fields %} - {{ custom_fields.body | post_field(post) | safe }} - {% else %} - {{ post.body | safe }} - {% endif %} - - -
      - - -
      + + {% else %} + {% include "explorer/datasource-templates/generic.html" %} - {% if annotation_fields %} - {% set old_annotations = None %} - - {% if annotations and post.id in annotations %} - {% set old_annotations = annotations[post.id] %} - {% endif %} - - {% for field in annotation_fields %} - - {% set type = annotation_fields[field]["type"] %} - {% set label = annotation_fields[field]["label"] %} - {% set old_annotation = "" %} - - {% if old_annotations and label in old_annotations %} - {% set old_annotation = old_annotations[label] %} - {% endif %} - -
      - {% if type == 'text' %} - - - {% elif type == 'textarea' %} - - - {% elif type == 'dropdown' %} - - - {% elif type == 'checkbox' %} -
      - - {% for option in annotation_fields[field]["options"] %} - {% set option_id = option.keys() | first %} - {% set option_label = option.values() | first %} - {% set checked = "checked" if old_annotation and option_label in old_annotation else "" %} - - - {% endfor %} -
      - {% endif %} - -
      - {% endfor %} - {% endif %} -
      + {% endif %} + + {% include "explorer/post-annotations.html" %} +
    • diff --git a/webtool/templates/results.html b/webtool/templates/results.html index 63954220b..339254a83 100644 --- a/webtool/templates/results.html +++ b/webtool/templates/results.html @@ -65,7 +65,13 @@

      {{ dataset.get_label() }}

      {% if not dataset.is_finished() or dataset.num_rows == 0 %}

      {% include "components/result-status.html" %}

      {% else %} - {{ dataset.get_results_path()|filesize }}, {{ dataset.result_file.split(".")[-1] }} + {% if dataset.get_results_path().exists() %} + {% if dataset.get_own_processor().map_item or dataset.get_annotation_fields() %} + Download csv + {% else %} + Download {{ dataset.result_file.split(".")[-1] }} + {% endif %} + {% endif %} {% endif %}
      diff --git a/webtool/views/api_explorer.py b/webtool/views/api_explorer.py deleted file mode 100644 index ad89eeb12..000000000 --- a/webtool/views/api_explorer.py +++ /dev/null @@ -1,727 +0,0 @@ -""" -4CAT Data API - endpoints to get post and thread data from -""" - -import json -import csv -import re -import markdown2 - - -from pathlib import Path - -from flask import jsonify, abort, send_file, request, render_template -from flask_login import login_required, current_user - -from webtool import app, db, openapi, limiter, config, fourcat_modules -from webtool.lib.helpers import format_chan_post, error, setting_required -from common.lib.dataset import DataSet -from common.lib.helpers import strip_tags -from common.lib.exceptions import DataSetException - -from common.config_manager import ConfigWrapper -config = ConfigWrapper(config, user=current_user, request=request) -api_ratelimit = limiter.shared_limit("45 per minute", scope="api") - -@app.route('/explorer/dataset//', defaults={'page': 0}) -@app.route('/explorer/dataset//') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_dataset(key, page): - """ - Show posts from a specific dataset - - :param str dataset_key: Dataset key - - :return-schema: {type=array,items={type=integer}} - - :return-error 404: If the dataset does not exist. - """ - - # Get dataset info. - try: - dataset = DataSet(key=key, db=db, modules=fourcat_modules) - except DataSetException: - return error(404, error="Dataset not found.") - - if dataset.is_private and not (config.get("privileges.can_view_all_datasets") or dataset.is_accessible_by(current_user)): - return error(403, error="This dataset is private.") - - if len(dataset.get_genealogy()) > 1: - return error(404, error="Exporer only available for top-level datasets") - - results_path = dataset.check_dataset_finished() - if not results_path: - return error(404, error="This dataset didn't finish executing (yet)") - - # The amount of posts to show on a page - limit = config.get("explorer.posts_per_page", 50) - - # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.max_posts', 500000) - - # The offset for posts depending on the current page - offset = ((page - 1) * limit) if page else 0 - - # Load some variables - parameters = dataset.get_parameters() - datasource = parameters["datasource"] - board = parameters.get("board", "") - post_count = int(dataset.data["num_rows"]) - annotation_fields = dataset.get_annotation_fields() - - # If the dataset is local, we can add some more features - # (like the ability to navigate to threads) - is_local = False - - if datasource in list(fourcat_modules.datasources.keys()): - is_local = True if fourcat_modules.datasources[datasource].get("is_local") else False - - # Check if we have to sort the data in a specific way. - sort_by = request.args.get("sort") - if sort_by == "dataset-order": - sort_by = None - - # Check if we have to reverse the order. - descending = request.args.get("desc") - if descending == "true" or descending == True: - descending = True - else: - descending = False - - # Check if we have to convert the sort value to an integer. 
- force_int = request.args.get("int") - if force_int == "true" or force_int == True: - force_int = True - else: - force_int = False - - # Load posts - post_ids = [] - posts = [] - count = 0 - - first_post = False - - for post in iterate_items(results_path, max_rows=max_posts, sort_by=sort_by, descending=descending, force_int=force_int): - - count += 1 - - # Use an offset if we're showing a page beyond the first. - if count <= offset: - continue - - # Attribute column names and collect dataset's posts. - post_ids.append(post["id"]) - posts.append(post) - - if "link_id" in post: - if post["link_id"][2] == "_": - post["link_id"] = post["link_id"][3:] - - # Stop if we exceed the max posts per page. - if count >= (offset + limit) or count > max_posts: - break - - # Include custom css if it exists in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.css'. - css = get_custom_css(datasource) - - # Include custom fields if it they are in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.json'. - # For some datasources (e.g. Twitter) we also have to explicitly set - # what data type we're working with. - filetype = dataset.get_extension() - custom_fields = get_custom_fields(datasource, filetype=filetype) - - # Convert posts from markdown to HTML - if custom_fields and "markdown" in custom_fields and custom_fields.get("markdown"): - posts = [convert_markdown(post) for post in posts] - # Clean up HTML - else: - posts = [strip_html(post) for post in posts] - posts = [format(post, datasource=datasource) for post in posts] - - if not posts: - return error(404, error="No posts available for this datasource") - - # Check whether there's already annotations inserted already. - # If so, also pass these to the template. - annotations = db.fetchone("SELECT * FROM annotations WHERE key = %s", (key,)) - if not annotations or not annotations.get("annotations"): - annotations = None - else: - annotations = json.loads(annotations["annotations"]) - - # Generate the HTML page - return render_template("explorer/explorer.html", key=key, datasource=datasource, board=board, is_local=is_local, parameters=parameters, annotation_fields=annotation_fields, annotations=annotations, posts=posts, custom_css=css, custom_fields=custom_fields, page=page, offset=offset, limit=limit, post_count=post_count, max_posts=max_posts) - -@app.route('/explorer/thread///') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_thread(datasource, board, thread_id): - """ - Show a thread in the explorer - - :param str datasource: Data source ID - :param str board: Board name - :param int thread_id: Thread ID - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not board: - return error(404, error="No board provided") - if not thread_id: - return error(404, error="No thread ID provided") - - # The amount of posts that may be included (limit for large datasets) - max_posts = config.get('explorer.max_posts', 500000) - - # Get the posts with this thread ID. 
- posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) - - if not posts: - return error(404, error="No posts available for this thread") - - posts = [strip_html(post) for post in posts] - posts = [format(post, datasource=datasource) for post in posts] - - # Include custom css if it exists in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.css'. - css = get_custom_css(datasource) - - # Include custom fields if it they are in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.json'. - custom_fields = get_custom_fields(datasource) - - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) - -@app.route('/explorer/post///') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def explorer_post(datasource, board, thread_id): - """ - Show a thread in the explorer - - :param str datasource: Data source ID - :param str board: Board name - :param int thread_id: Thread ID - - :return-error 404: If the thread ID does not exist for the given data source. - """ - - if not datasource: - return error(404, error="No datasource provided") - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - if not board: - return error(404, error="No board provided") - if not thread_id: - return error(404, error="No thread ID provided") - - # Get the posts with this thread ID. - posts = get_posts(db, datasource, board=board, ids=tuple([thread_id]), threads=True, order_by=["id"]) - - posts = [strip_html(post) for post in posts] - posts = [format(post) for post in posts] - - # Include custom css if it exists in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.css'. - css = get_custom_css(datasource) - - # Include custom fields if it they are in the datasource's 'explorer' dir. - # The file's naming format should e.g. be 'reddit-explorer.json'. - custom_fields = get_custom_fields(datasource) - - return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, custom_css=css, custom_fields=custom_fields, limit=len(posts), post_count=len(posts)) - -@app.route("/explorer/save_annotation_fields/", methods=["POST"]) -@api_ratelimit -@login_required -@setting_required("privileges.can_run_processors") -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def save_annotation_fields(key): - """ - Save the annotation fields of a dataset to the datasets table. - If the changes to the annotation fields affect existing annotations, - this function also updates or deleted those old values. - - :param str key: The dataset key - - :return-error 404: If the dataset ID does not exist. - """ - - if not key: - return error(404, error="No dataset key provided") - - # Do some preperations - new_fields = request.get_json() - new_field_ids = set(new_fields.keys()) - text_fields = ["textarea", "text"] - option_fields = set() - - # Get dataset info. 
- dataset = db.fetchone("SELECT key, annotation_fields FROM datasets WHERE key = %s;", (key,)) - - if not dataset: - return error(404, error="Dataset not found") - - # We're saving the annotation fields as-is - db.execute("UPDATE datasets SET annotation_fields = %s WHERE key = %s;", (json.dumps(new_fields), key)) - - # If fields and annotations were saved before, we must also check whether we need to - # change old annotation data, for instance when a field is deleted or its label has changed. - - # Get the annotation fields that were already saved to check what's changed. - old_fields = dataset.get("annotation_fields") - if old_fields: - old_fields = json.loads(old_fields) - - # Get the annotations - if old_fields: - annotations = db.fetchone("SELECT annotations FROM annotations WHERE key = %s;", (key,)) - if annotations and "annotations" in annotations: - if not annotations["annotations"]: - annotations = None - else: - annotations = json.loads(annotations["annotations"]) - - # If there's old fields *and* annotations saved, we need to check if we need to update stuff. - if old_fields and annotations: - - fields_to_delete = set() - labels_to_update = {} - options_to_delete = set() - options_to_update = {} - - for field_id, field in old_fields.items(): - - # We'll delete all prior annotations for a field if its input field is deleted - if field_id not in new_field_ids: - - # Labels are used as keys in the annotations table - # They should already be unique, so that's okay. - fields_to_delete.add(field["label"]) - continue - - # If the type has changed, also delete prior references (except between text and textarea) - new_type = new_fields[field_id]["type"] - if field["type"] != new_type: - - if not field["type"] in text_fields and not new_type in text_fields: - fields_to_delete.add(field["label"]) - continue - - # If the label has changed, change it in the old annotations - old_label = old_fields[field_id]["label"] - new_label = new_fields[field_id]["label"] - - if old_label != new_label: - labels_to_update[old_label] = new_label - - # Check if the options for dropdowns or checkboxes have changed - if new_type == "checkbox" or new_type == "dropdown": - - if "options" in old_fields[field_id]: - - option_fields.add(old_fields[field_id]["label"]) - new_options = new_fields[field_id]["options"] - - new_ids = [list(v.keys())[0] for v in new_options] - new_ids = [list(v.keys())[0] for v in new_options] - - # If it's a dropdown or checkbox.. - for option in old_fields[field_id]["options"]: - option_id = list(option.keys())[0] - option_label = list(option.values())[0] - - # If this ID is not present anymore, delete it - if option_id not in new_ids: - options_to_delete.add(option_label) - continue - - # Change the label if it has changed. Bit ugly but it works. 
- new_label = [list(new_option.values())[0] for i, new_option in enumerate(new_options) if list(new_options[i].keys())[0] == option_id][0] - - if option_label != new_label: - options_to_update[option_label] = new_label - - # Loop through the old annotations if things need to be changed - if fields_to_delete or labels_to_update or options_to_update or options_to_delete: - - for post_id in list(annotations.keys()): - - for field_label in list(annotations[post_id].keys()): - - # Delete the field entirely - if field_label in fields_to_delete: - del annotations[post_id][field_label] - continue - - # Update the label - if field_label in labels_to_update: - annotations[post_id][labels_to_update[field_label]] = annotations[post_id].pop(field_label) - field_label = labels_to_update[field_label] - - # Update or delete option values - if field_label in option_fields: - options_inserted = annotations[post_id][field_label] - - # We can just delete/change the entire annotation if its a string - if type(options_inserted) == str: - - # Delete the option if it's not present anymore - if options_inserted in options_to_delete: - del annotations[post_id][field_label] - - # Update the option label if it has changed - elif options_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[options_inserted] - - # For lists (i.e. checkboxes), we have to loop - elif type(options_inserted) == list: - - for option_inserted in options_inserted: - - # Delete the option if it's not present anymore - if option_inserted in options_to_delete: - annotations[post_id][field_label].remove(option_inserted) - - # Update the option label if it has changed - elif option_inserted in options_to_update: - annotations[post_id][field_label] = options_to_update[option_inserted] - - # Delete entire post dict if there's nothing left - if not annotations[post_id]: - del annotations[post_id] - - # Save annotations as an empty string if there's none. - if not annotations: - annotations = "" - else: - annotations = json.dumps(annotations) - - # Insert into the annotations table. - db.execute("INSERT INTO annotations(key, annotations) VALUES(%s, %s) ON CONFLICT (key) DO UPDATE SET annotations = %s ", (key, annotations, annotations)) - - return "success" - -@app.route("/explorer/save_annotations/", methods=["POST"]) -@api_ratelimit -@login_required -@setting_required("privileges.can_run_processors") -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("explorer") -def save_annotations(key): - """ - Save the annotations of a dataset to the annotations table. - - :param str key: The dataset key - - :return-error 404: If the dataset ID does not exist. - """ - - if not key: - return error(404, error="No dataset key provided") - - new_annotations = request.get_json() - - # If there were already annotations added, we need to make sure - # we're not incorrectly overwriting any. - # We also need to check whether any of the input fields have changed. - # If so, we're gonna edit or remove their old values. - old_annotations = db.fetchone("SELECT annotations FROM annotations WHERE key = %s;", (key,)) - - if old_annotations: - - if "annotations" in old_annotations and old_annotations["annotations"]: - old_annotations = json.loads(old_annotations["annotations"]) - - # Loop through all new annotations and add/overwrite them - # with the old annotations dict. 
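The merge loop that follows implements simple semantics: new annotations overwrite old ones per post ID, and a post whose new payload is empty is dropped entirely. A standalone sketch of those semantics, with dict shapes assumed from the surrounding code:

```python
def merge_annotations(old: dict, new: dict) -> dict:
    """Overlay `new` post annotations on `old`; drop posts left without annotations."""
    merged = dict(old)
    for post_id, fields in new.items():
        if fields:
            merged[post_id] = fields   # overwrite or add
        else:
            merged.pop(post_id, None)  # an empty payload means "delete"
    return merged

old = {"p1": {"Valid": "Yes"}, "p2": {"Valid": "No"}}
new = {"p2": {}, "p3": {"Valid": "Maybe"}}
assert merge_annotations(old, new) == {"p1": {"Valid": "Yes"}, "p3": {"Valid": "Maybe"}}
```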
- for post_id in list(new_annotations.keys()): - old_annotations[post_id] = new_annotations[post_id] - if not old_annotations[post_id]: - del old_annotations[post_id] - - new_annotations = old_annotations - - if not new_annotations: - new_annotations = "" - else: - new_annotations = json.dumps(new_annotations) - - # We're saving all annotations as a JSON string in one go - db.execute("INSERT INTO annotations(key, annotations) VALUES(%s, %s) ON CONFLICT (key) DO UPDATE SET annotations = %s ", (key, new_annotations, new_annotations)) - - return "success" - -@app.route('/api//boards.json') -@api_ratelimit -@login_required -@setting_required("privileges.can_use_explorer") -@openapi.endpoint("data") -def get_boards(datasource): - """ - Get available boards in datasource - - :param datasource: The datasource for which to acquire the list of available - boards. - :return: A list containing a list of `boards`, as string IDs. - - :return-schema: {type=object,properties={ - boards={type=array,items={type=object,properties={ - board={type=string} - }}} - }} - - :return-error 404: If the datasource does not exist. - """ - if datasource not in config.get('datasources.enabled'): - return error(404, error="Invalid data source") - - boards = db.fetchall("SELECT DISTINCT board FROM threads_" + datasource) - return jsonify({"boards": [{"board": board["board"]} for board in boards]}) - -@app.route('/api/image/') -@app.route('/api/imagefile/') -@login_required -@setting_required("privileges.can_use_explorer") -def get_image_file(img_file, limit=0): - """ - Returns an image based on filename - Request should hex the md5 hashes first (e.g. with hexdigest()) - - """ - if not re.match(r"([a-zA-Z0-9]+)\.([a-z]+)", img_file): - abort(404) - - image_path = Path(config.get('PATH_ROOT'), config.get('PATH_IMAGES'), img_file) - if not image_path.exists(): - abort(404) - - return send_file(str(image_path)) - -def iterate_items(in_file, max_rows=None, sort_by=None, descending=False, force_int=False): - """ - Loop through both csv and NDJSON files. - :param in_file, str: The input file to read. - :param sort_by, str: The key that determines the sort order. - :param descending, bool: Whether to sort by descending values. - :param force_int, bool: Whether the sort value should be converted to an - integer. - """ - - suffix = in_file.name.split(".")[-1].lower() - - if suffix == "csv": - - with open(in_file, "r", encoding="utf-8") as dataset_file: - - # Sort on date by default - # Unix timestamp integers are not always saved in the same field. - reader = csv.reader(dataset_file) - columns = next(reader) - if sort_by: - try: - # Get index number of sort_by value - sort_by_index = columns.index(sort_by) - - # Generate reader on the basis of sort_by value - reader = sorted(reader, key=lambda x: to_float(x[sort_by_index], convert=force_int) if len(x) >= sort_by_index else 0, reverse=descending) - - except (ValueError, IndexError) as e: - pass - - for item in reader: - - # Add columns - item = {columns[i]: item[i] for i in range(len(item))} - - yield item - - elif suffix == "ndjson": - - # In this format each line in the file is a self-contained JSON - # file - with open(in_file, "r", encoding="utf-8") as dataset_file: - - # Unfortunately we can't easily sort here. - # We're just looping through the file if no sort is given. - if not sort_by: - for line in dataset_file: - item = json.loads(line) - yield item - - # If a sort order is given explicitly, we're sorting anyway. 
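In the NDJSON branch that follows, the sort key can be nested: `sort_by` is split on dots and walked into each JSON object. A self-contained sketch of that pattern, mirroring the 0-on-missing behaviour of the `get_nested_value` helper defined further down:

```python
import json
from io import StringIO

def get_nested_value(d, keys: list):
    """Walk `keys` into nested dicts; return 0 when a step is missing (as in the helper below)."""
    for key in keys:
        d = d.get(key) if isinstance(d, dict) else None
        if not d:
            return 0
    return d

ndjson = StringIO('{"id": 1, "stats": {"score": 5}}\n'
                  '{"id": 2, "stats": {"score": 9}}\n'
                  '{"id": 3}\n')
keys = "stats.score".split(".")
items = sorted((json.loads(line) for line in ndjson),
               key=lambda x: get_nested_value(x, keys), reverse=True)
assert [i["id"] for i in items] == [2, 1, 3]  # missing key sorts as 0
```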
- else: - keys = sort_by.split(".") - - if max_rows: - for item in sorted([json.loads(line) for i, line in enumerate(dataset_file) if i < max_rows], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item - else: - for item in sorted([json.loads(line) for line in dataset_file], key=lambda x: to_float(get_nested_value(x, keys), convert=force_int), reverse=descending): - yield item - - return Exception("Can't loop through file with extension %s" % suffix) - -def get_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]): - - if not ids: - return None - - if board: - board = " AND board = '" + board + "' " - - id_field = "id" if not threads else "thread_id" - order_by = " ORDER BY " + ", ".join(order_by) - limit = "" if not limit or limit <= 0 else " LIMIT %i" % int(limit) - offset = " OFFSET %i" % int(offset) - - posts = db.fetchall("SELECT * FROM posts_" + datasource + " WHERE " + id_field + " IN %s " + board + order_by + " ASC" + limit + offset, - (ids,)) - if not posts: - return False - - return posts - -def get_custom_css(datasource): - """ - Check if there's a custom css file for this dataset. - If so, return the text. - Custom css files should be placed in an 'explorer' directory in the the datasource folder and named - '-explorer.css' (e.g. 'reddit/explorer/reddit-explorer.css'). - See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for more information. - - :param datasource, str: Datasource name - - :return: The css as string. - """ - - # Set the directory name of this datasource. - # Some naming inconsistensies are caught here - if datasource == "twitter": - datasource_dir = "twitter-import" - datasource = "twitter-import" - else: - datasource_dir = datasource - - - css_path = Path(config.get('PATH_ROOT'), "datasources", datasource_dir, "explorer", datasource.lower() + "-explorer.css") - - print(css_path) - read = False - if css_path.exists(): - read = True - else: - # Allow both hypens and underscores in datasource name (to avoid some legacy issues) - css_path = re.sub(datasource, datasource.replace("-", "_"), str(css_path.absolute())) - if Path(css_path).exists(): - read = True - - # Read the css file if it exists - if read: - with open(css_path, "r", encoding="utf-8") as css: - css = css.read() - else: - css = None - - return css - -def get_custom_fields(datasource, filetype=None): - """ - Check if there are custom fields that need to be showed for this datasource. - If so, return a dictionary of those fields. - Custom field json files should be placed in an 'explorer' directory in the the datasource folder and named - '-explorer.json' (e.g. 'reddit/explorer/reddit-explorer.json'). - See https://github.com/digitalmethodsinitiative/4cat/wiki/Exploring-and-annotating-datasets for more information. - - :param datasource, str: Datasource name - :param filetype, str: The filetype that is handled. This can fluctuate - between e.g. NDJSON and csv files. - - :return: Dictionary of custom fields that should be shown. - """ - - # Set the directory name of this datasource. 
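The CSS and JSON lookups in this part of the file encode a small convention: assets live in `datasources/<name>/explorer/` and are named after the datasource, with a hyphen-to-underscore fallback for legacy directory names. A sketch of that resolution order under those assumptions (the `resolve_explorer_asset` helper and its `root` argument are illustrative):

```python
from pathlib import Path
from typing import Optional

def resolve_explorer_asset(root: Path, datasource: str, suffix: str = "-explorer.css") -> Optional[Path]:
    """Return the first existing asset path, trying the hyphenated name first, then underscores."""
    for name in (datasource, datasource.replace("-", "_")):
        candidate = root / "datasources" / name / "explorer" / (name.lower() + suffix)
        if candidate.exists():
            return candidate
    return None  # no custom asset for this datasource

# e.g. resolve_explorer_asset(Path("/opt/4cat"), "twitter-import")
```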
- if datasource == "twitter": - datasource_dir = "twitter-import" - datasource = "twitter-import" - else: - datasource_dir = datasource - - json_path = Path(config.get('PATH_ROOT'), "datasources", datasource_dir, "explorer", datasource.lower() + "-explorer.json") - read = False - - if json_path.exists(): - read = True - else: - # Allow both hypens and underscores in datasource name (to avoid some legacy issues) - json_path = re.sub(datasource, datasource.replace("-", "_"), str(json_path.absolute())) - if Path(json_path).exists(): - read = True - - if read: - with open(json_path, "r", encoding="utf-8") as json_file: - try: - custom_fields = json.load(json_file) - except ValueError as e: - return ("invalid", e) - else: - custom_fields = None - - filetype = filetype.replace(".", "") - if filetype and custom_fields: - if filetype in custom_fields: - custom_fields = custom_fields[filetype] - else: - custom_fields = None - - return custom_fields - -def get_nested_value(di, keys): - """ - Gets a nested value on the basis of a dictionary and a list of keys. - """ - - for key in keys: - di = di.get(key) - if not di: - return 0 - return di - -def to_float(value, convert=False): - if convert: - if not value: - value = 0 - else: - value = float(value) - return value - -def strip_html(post): - post["body"] = strip_tags(post.get("body", "")) - return post - -def format(post, datasource=""): - if "chan" in datasource or datasource == "8kun": - post["body"] = format_chan_post(post.get("body", "")) - post["body"] = post.get("body", "").replace("\n", "
      ") - return post - -def convert_markdown(post): - post["body"] = post.get("body", "").replace("\n", "\n\n").replace(">", ">").replace("] (", "](") - post["body"] = markdown2.markdown(post.get("body", ""), extras=["nofollow","target-blank-links"]) - return post diff --git a/webtool/views/views_admin.py b/webtool/views/views_admin.py index 9e09c9f06..78b028be3 100644 --- a/webtool/views/views_admin.py +++ b/webtool/views/views_admin.py @@ -568,10 +568,12 @@ def manipulate_settings(): flash("Invalid settings: %s" % str(e)) all_settings = config.get_all(user=None, tags=[tag]) + options = {} changed_categories = set() - for option in sorted({*all_settings.keys(), *definition.keys()}): + + for option in {*all_settings.keys(), *definition.keys()}: tag_value = all_settings.get(option, definition.get(option, {}).get("default")) global_value = global_settings.get(option, definition.get(option, {}).get("default")) is_changed = tag and global_value != tag_value @@ -613,7 +615,16 @@ def manipulate_settings(): changed_categories.add(option.split(".")[0]) tab = "" if not request.form.get("current-tab") else request.form.get("current-tab") - options = {k: options[k] for k in sorted(options, key=lambda o: options[o]["tabname"])} + + # We are ordering the options based on how they are ordered in their dictionaries, + # and not the database order. To do so, we're adding a simple config order number + # and sort on this. + config_order = 0 + for k, v in definition.items(): + options[k]["config_order"] = config_order + config_order += 1 + + options = {k: options[k] for k in sorted(options, key=lambda o: (options[o]["tabname"], options[o].get("config_order", 0)))} # 'data sources' is one setting but we want to be able to indicate # overrides per sub-item diff --git a/webtool/views/views_dataset.py b/webtool/views/views_dataset.py index 1720b3a84..bdd86a3f0 100644 --- a/webtool/views/views_dataset.py +++ b/webtool/views/views_dataset.py @@ -179,6 +179,8 @@ def get_mapped_result(key): processor of the dataset has a method for mapping its data to CSV, then this route uses that to convert the data to CSV on the fly and serve it as such. + We also use this if there's annotation data saved. + :param str key: Dataset key """ try: @@ -190,22 +192,6 @@ def get_mapped_result(key): config.get("privileges.can_view_private_datasets") or dataset.is_accessible_by(current_user)): return error(403, error="This dataset is private.") - if dataset.get_extension() == ".csv": - # if it's already a csv, just return the existing file - return url_for("get_result", query_file=dataset.get_results_path().name) - - if not hasattr(dataset.get_own_processor(), "map_item"): - # cannot map without a mapping method - return error(404, error="File not found.") - - # Also add possibly added annotation items. - # These cannot be added to the static `map_item` function. 
- annotation_labels = None
- annotation_fields = dataset.get_annotation_fields()
- if annotation_fields:
- annotation_labels = ["annotation_" + v["label"] for v in annotation_fields.values()]
- annotations = dataset.get_annotations()
-
 def map_response():
 """
 Yield a CSV file line by line
@@ -219,10 +205,6 @@ def map_response():
 for item in dataset.iterate_items(processor=dataset.get_own_processor(), warn_unmappable=False):
 if not writer:
 fieldnames = list(item.keys())
- if annotation_labels:
- for label in annotation_labels:
- if label not in fieldnames:
- fieldnames.append(label)
 writer = csv.DictWriter(buffer, fieldnames=fieldnames)
 writer.writeheader()
@@ -230,10 +212,6 @@ def map_response():
 buffer.truncate(0)
 buffer.seek(0)
- if annotation_fields:
- for label in annotation_labels:
- item[label] = annotations.get(item.get("id"), {}).get(label, "")
-
 writer.writerow(item)
 yield buffer.getvalue()
 buffer.truncate(0)
@@ -430,7 +408,7 @@ def show_result(key):
 datasources = fourcat_modules.datasources
 datasource_expiration = config.get("datasources.expiration", {}).get(datasource, {})
 expires_datasource = False
- can_unexpire = ((config.get('expire.allow_optout') and \
+ can_unexpire = ((config.get("expire.allow_optout") and \
 datasource_expiration.get("allow_optout", True)) or datasource_expiration.get("allow_optout", False)) \
 and (current_user.is_admin or dataset.is_accessible_by(current_user, "owner"))
@@ -444,6 +422,8 @@ def show_result(key):
 elif dataset.parameters.get("expires-after"):
 timestamp_expires = dataset.parameters.get("expires-after")
+ has_explorer = config.get("explorer.config", {}).get(datasource, {}).get("enabled", False)
+
 # if the dataset has parameters with credentials, give user the option to
 # erase them
 has_credentials = [p for p in dataset.parameters if p.startswith("api_") and p not in ("api_type", "api_track")]
@@ -456,7 +436,8 @@ def show_result(key):
 return render_template(template, dataset=dataset, parent_key=dataset.key, processors=fourcat_modules.processors,
 is_processor_running=is_processor_running, messages=get_flashed_messages(),
 is_favourite=is_favourite, timestamp_expires=timestamp_expires, has_credentials=has_credentials,
- expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, datasources=datasources)
+ expires_by_datasource=expires_datasource, can_unexpire=can_unexpire, has_explorer=has_explorer,
+ datasources=datasources)
@app.route('/results/<string:key>/processors/queue/<string:processor>/', methods=["GET", "POST"])
diff --git a/webtool/views/views_explorer.py b/webtool/views/views_explorer.py
new file mode 100644
index 000000000..51e9acc66
--- /dev/null
+++ b/webtool/views/views_explorer.py
@@ -0,0 +1,379 @@
+"""
+4CAT Explorer views - pages that display datasets in a legible
+format and let users annotate the data.
+"""
+
+from pathlib import Path
+
+from flask import request, render_template, jsonify
+from flask_login import login_required, current_user
+from webtool import app, db, openapi, limiter, config, fourcat_modules
+from webtool.lib.helpers import error, setting_required
+from common.lib.dataset import DataSet
+from common.lib.helpers import convert_to_float, hash_to_md5
+from common.lib.exceptions import DataSetException, AnnotationException
+from common.config_manager import ConfigWrapper
+
+config = ConfigWrapper(config, user=current_user, request=request)
+api_ratelimit = limiter.shared_limit("45 per minute", scope="api")
+
+
+@app.route("/results/<string:dataset_key>/explorer/", defaults={"page": 1, "show_annotations": False})
+@app.route("/results/<string:dataset_key>/explorer/page/<int:page>")
+@api_ratelimit
+@login_required
+@setting_required("privileges.can_use_explorer")
+@openapi.endpoint("explorer")
+def explorer_dataset(dataset_key: str, page=1, show_annotations=False):
+ """
+ Show posts from a dataset
+
+ :param str dataset_key: Dataset key
+
+ :return-schema: {type=array,items={type=integer}}
+
+ :return-error 404: If the dataset does not exist.
+ """
+
+ # Get dataset info.
+ try:
+ dataset = DataSet(key=dataset_key, db=db, modules=fourcat_modules)
+ except DataSetException:
+ return error(404, error="Dataset not found.")
+
+ # Load some variables
+ parameters = dataset.get_parameters()
+ datasource = parameters["datasource"]
+ post_count = int(dataset.data["num_rows"])
+ annotation_fields = dataset.get_annotation_fields()
+ warning = ""
+
+ # See if we can actually serve this page
+ if dataset.is_private and not (config.get("privileges.can_view_all_datasets") or dataset.is_accessible_by(current_user)):
+ return error(403, error="This dataset is private.")
+
+ if len(dataset.get_genealogy()) > 1:
+ return error(404, error="Only available for top-level datasets.")
+
+ results_path = dataset.check_dataset_finished()
+ if not results_path:
+ return error(404, error="This dataset did not finish executing.")
+
+ if not config.get("explorer.config", {}).get(datasource, {}).get("enabled"):
+ return error(404, error="Explorer functionality disabled for %s." % datasource)
+
+ # The number of posts to show on a page
+ posts_per_page = config.get("explorer.posts_per_page", 50)
+
+ # The number of posts that may be included (limit for large datasets)
+ max_posts = config.get('explorer.max_posts', 500000)
+
+ # The offset for posts depending on the current page
+ offset = ((page - 1) * posts_per_page) if page else 0
+
+ # If the dataset is generated from an API-accessible database, we can add
+ # extra features like the ability to navigate across posts.
+ has_database = False # todo: integrate
+
+ # Check if we have to sort the data.
+ sort = request.args.get("sort")
+
+ # Check if we have to reverse the order.
+ reverse = request.args.get("order") == "reverse"
+
+ # Load posts
+ post_ids = []
+ posts = []
+ count = 0
+
+ # Load annotations with post IDs as keys and their annotations as lists.
+ annotations = {}
+
+ # We don't need to sort if we're showing the existing dataset order (default).
+ # If we're sorting, we need to iterate over the entire dataset first.
+ if not sort or (sort == "dataset-order" and not reverse):
+ for row in dataset.iterate_items(warn_unmappable=False):
+
+ count += 1
+
+ # Use an offset if we're showing a page beyond the first.
+ if count <= offset:
+ continue
+
+ # Attribute column names and collect dataset's posts.
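A little further down, the view collects annotations per post so the template can look them up by post ID: a plain dict keyed by ID. A small sketch of that shape; the `FakeDataset` stand-in is hypothetical, while `get_annotations(item_id=...)` mirrors the call used below.

```python
class FakeDataset:
    """Hypothetical stand-in for DataSet, exposing only the call the view relies on."""
    _store = {"p1": [{"label": "Valid", "value": "Yes"}]}

    def get_annotations(self, item_id: str) -> list:
        return self._store.get(item_id, [])

dataset = FakeDataset()
post_ids = ["p1", "p2"]

# Keyed by post ID so the template can retrieve annotations per rendered post.
annotations = {post_id: dataset.get_annotations(item_id=post_id) for post_id in post_ids}
assert annotations == {"p1": [{"label": "Valid", "value": "Yes"}], "p2": []}
```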
+ post_ids.append(row["id"])
+ posts.append(row)
+
+ # Stop if we exceed the allowed posts per page or max posts.
+ if count >= (offset + posts_per_page) or count > max_posts:
+ break
+ else:
+ for row in sort_and_iterate_items(dataset, sort, reverse=reverse, warn_unmappable=False):
+ count += 1
+ if count <= offset:
+ continue
+ post_ids.append(row["id"])
+ posts.append(row)
+ if count >= (offset + posts_per_page) or count > max_posts:
+ break
+
+ if not posts:
+ return error(404, error="No posts available, or posts could not be displayed")
+
+ # Check whether annotations have already been made.
+ # If so, also pass these to the template and set the post ID
+ # as key, so we can easily retrieve them.
+ for post_id in post_ids:
+ annotations[post_id] = dataset.get_annotations(item_id=post_id)
+
+ # We can use either a generic or a pre-made, data source-specific template.
+ template = "datasource" if has_datasource_template(datasource) else "generic"
+ if template == "generic":
+ posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/generic.css")
+ else:
+ posts_css = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css")
+
+ # Read CSS and pass as a string
+ with open(posts_css, "r", encoding="utf-8") as css:
+ posts_css = css.read()
+
+ # Generate the HTML page
+ return render_template("explorer/explorer.html", dataset=dataset, datasource=datasource, has_database=has_database, posts=posts, annotation_fields=annotation_fields, annotations=annotations, template=template, posts_css=posts_css, page=page, offset=offset, posts_per_page=posts_per_page, post_count=post_count, max_posts=max_posts, warning=warning)
+
+@app.route("/explorer/save_annotation_fields/<string:dataset_key>", methods=["POST"])
+@api_ratelimit
+@login_required
+@setting_required("privileges.can_run_processors")
+@setting_required("privileges.can_use_explorer")
+@openapi.endpoint("explorer")
+def explorer_save_annotation_fields(dataset_key: str):
+ """
+ Save the annotation fields of a dataset to the datasets table.
+
+ :param dataset_key: The dataset key.
+
+ :return-error 404: If the dataset key does not exist.
+ :return int: The number of annotation fields saved.
+ """
+
+ # Get dataset.
+ if not dataset_key:
+ return error(404, error="No dataset key provided")
+ try:
+ dataset = DataSet(key=dataset_key, db=db)
+ except DataSetException:
+ return error(404, error="Dataset not found.")
+
+ # Get the annotation fields from the request payload.
+ annotation_fields = request.get_json()
+
+ # Field IDs are not immediately set in the front end.
+ # We generate them here from a hash of the dataset key
+ # and the input label (which should be unique).
+ field_keys = list(annotation_fields.keys())
+ for field_id in field_keys:
+ if "tohash" in field_id:
+ new_field_id = hash_to_md5(dataset_key + annotation_fields[field_id]["label"])
+ annotation_fields[new_field_id] = annotation_fields[field_id]
+ del annotation_fields[field_id]
+
+ try:
+ fields_saved = dataset.save_annotation_fields(annotation_fields)
+ except AnnotationException as e:
+ # If anything went wrong with the annotation field saving, return an error.
+ return jsonify(error=str(e)), 400
+
+ # Else return the number of fields saved.
+ return str(fields_saved)
+
+@app.route("/explorer/save_annotations/<string:dataset_key>", methods=["POST"])
+@api_ratelimit
+@login_required
+@setting_required("privileges.can_run_processors")
+@setting_required("privileges.can_use_explorer")
+@openapi.endpoint("explorer")
+def explorer_save_annotations(dataset_key: str):
+ """
+ Save the annotations of a dataset to the annotations table.
+
+ :param dataset_key: The dataset key. Must be explicitly given to ensure
+ annotations are tied to a dataset.
+
+ :return-error 404: If the dataset key does not exist.
+
+ """
+
+ # Get the annotations from the request payload.
+ annotations = request.get_json()
+ try:
+ dataset = DataSet(key=dataset_key, db=db)
+ except DataSetException:
+ return error(404, error="Dataset not found.")
+
+ try:
+ annotations_saved = dataset.save_annotations(annotations, overwrite=True)
+ except AnnotationException as e:
+ # If anything went wrong with the annotation saving, return an error.
+ return jsonify(error=str(e)), 400
+
+ # Else return the number of annotations saved.
+ return str(annotations_saved)
+
+def sort_and_iterate_items(dataset: DataSet, sort="", reverse=False, **kwargs):
+ """
+ Sort and loop through a dataset's items (csv or NDJSON).
+ This is basically a wrapper function for `iterate_items()` with the
+ added functionality of sorting a dataset. Because the Explorer is (currently)
+ the only feature that requires sorting, we define it here.
+ This first iterates through the entire file (with a max limit) to determine
+ an order. Then it yields items based on this order.
+
+ :param dataset: The dataset object.
+ :param sort: The item key that determines the sort order.
+ :param reverse: Whether to sort by largest values first.
+
+ :returns: Yields sorted items, one dictionary per post.
+ """
+
+ # Storing posts in the right order here
+ sorted_posts = []
+
+ # Use reversed() if we're reading the dataset from back to front.
+ if sort == "dataset-order" and reverse:
+ for item in reversed(list(dataset.iterate_items(**kwargs))):
+ sorted_posts.append(item)
+
+ # Sort on the basis of a column value
+ else:
+ try:
+ for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: x.get(sort, ""), reverse=reverse):
+ sorted_posts.append(item)
+ except TypeError:
+ # Dataset fields can contain integers and empty strings.
+ # Since these cannot be compared, we will convert every
+ # empty string to 0.
+ for item in sorted(dataset.iterate_items(**kwargs), key=lambda x: convert_to_float(x.get(sort, "")), reverse=reverse):
+ sorted_posts.append(item)
+
+ yield from sorted_posts
+
+
+def has_datasource_template(datasource: str) -> bool:
+ """
+ Check if the data source has a data source-specific template.
+ This requires HTML and CSS files.
+ Custom HTML files should be placed in `webtool/templates/explorer/datasource-templates/<datasource>.html`.
+ Custom CSS files should be placed in `webtool/static/css/explorer/<datasource>.css`.
+
+ :param datasource: Datasource name.
+
+ :returns: Whether the required files are present.
+ """
+ css_exists = Path(config.get('PATH_ROOT'), "webtool/static/css/explorer/" + datasource + ".css").exists()
+ html_exists = Path(config.get('PATH_ROOT'), "webtool/templates/explorer/datasource-templates/" + datasource + ".html").exists()
+
+ return css_exists and html_exists
+
+def get_database_posts(db, datasource, ids, board="", threads=False, limit=0, offset=0, order_by=["timestamp"]):
+ """
+ todo: Integrate later
+ Retrieve posts by ID from a database-accessible data source.
+ """ + + raise NotImplementedError + + if not ids: + return None + + if board: + board = " AND board = '" + board + "' " + + id_field = "id" if not threads else "thread_id" + order_by = " ORDER BY " + ", ".join(order_by) + limit = "" if not limit or limit <= 0 else " LIMIT %i" % int(limit) + offset = " OFFSET %i" % int(offset) + + posts = db.fetchall("SELECT * FROM posts_" + datasource + " WHERE " + id_field + " IN %s " + board + order_by + " ASC" + limit + offset, + (ids,)) + if not posts: + return False + + return posts + +@app.route('/results///explorer') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_thread(datasource, thread_id): + """ + todo: INTEGRATE LATER! + + Show a thread from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int thread_id: Thread ID + + :return-error 404: If the thread ID does not exist for the given data source. + """ + raise NotImplementedError + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not thread_id: + return error(404, error="No thread ID provided") + + # The amount of posts that may be included (limit for large datasets) + max_posts = config.get('explorer.max_posts', 500000) + + # Get the posts with this thread ID. + #todo: define function get_api_posts + posts = get_api_posts(db, datasource, ids=tuple([thread_id]), threads=True, order_by=["id"]) + + if not posts: + return error(404, error="No posts available for this thread") + + posts = [strip_html(post) for post in posts] + posts = [format(post, datasource=datasource) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts), thread=thread_id, max_posts=max_posts) + +@app.route('/explorer/post///') +@api_ratelimit +@login_required +@setting_required("privileges.can_use_explorer") +@openapi.endpoint("explorer") +def explorer_api_posts(datasource, post_ids): + """ + todo: INTEGRATE LATER + + Show posts from an API-accessible database. + + :param str datasource: Data source ID + :param str board: Board name + :param int post_ids: Post IDs + + :return-error 404: If the thread ID does not exist for the given data source. + """ + raise NotImplementedError + + if not datasource: + return error(404, error="No datasource provided") + if datasource not in config.get('datasources.enabled'): + return error(404, error="Invalid data source") + if not post_ids: + return error(404, error="No thread ID provided") + + # Get the posts with this thread ID. + posts = get_database_posts(db, datasource, board=board, ids=tuple([post_ids]), threads=True, order_by=["id"]) + + posts = [strip_html(post) for post in posts] + posts = [format(post) for post in posts] + + return render_template("explorer/explorer.html", datasource=datasource, board=board, posts=posts, datasource_config=datasource_config, posts_per_page=len(posts), post_count=len(posts)) \ No newline at end of file