diff --git a/unfurl/parsers/parse_tiktok.py b/unfurl/parsers/parse_tiktok.py index c97b2e2..712e689 100644 --- a/unfurl/parsers/parse_tiktok.py +++ b/unfurl/parsers/parse_tiktok.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datetime import datetime +from unfurl.utils import extract_bits, set_bits + import logging log = logging.getLogger(__name__) @@ -25,30 +28,95 @@ def parse_tiktok_id(unfurl, node, on_tiktok=True): + """Parse a TikTok ID, extracting the timestamp, sequence, machine ID, and entity type. + + TikTok IDs are 64 bits long; often rendered as a large + number (6854717870488702213). + + Bits | Field | Description + ------|--------------|-------------------------------- + 63:32 | Timestamp | in epoch seconds format + 31:22 | Milliseconds | 0-999 in most cases + 21:19 | | + 18:14 | Sequence | + 13:12 | | + 11:8 | Entity Type | 5 values for 4 entity types + 7:0 | Machine ID | + + Ref: https://arxiv.org/pdf/2504.13279 +""" try: tiktok_id = int(node.value) - # TikTok IDs are 63 bits long; the upper 31 bits are the timestamp. - # Shift it 32 (63-32 = 31) bits right, leaving just the timestamp bits. - timestamp = (tiktok_id >> 32) - - except Exception as e: - log.exception(f'Exception parsing TikTok ID: {e}') + except ValueError as e: + # Valid TikTok IDs should be ints, so this wasn't one return + timestamp = extract_bits(tiktok_id, 32, 64) + milliseconds = extract_bits(tiktok_id, 23, 32) + sequence = extract_bits(tiktok_id, 14, 19) + entity_type = extract_bits(tiktok_id, 8, 12) + machine_id = extract_bits(tiktok_id, 0, 8) + + entity_type_map = { + 0: 'User Account', + 4: 'User Account', + 6: 'Device (smartphone or web browser)', + 11: 'Live session', + 13: 'Video' + } + entity_type_str = entity_type_map.get(entity_type, 'Unknown') + if on_tiktok: node.hover = 'TikTok IDs are time-based IDs similar to those of Twitter Snowflakes.' unfurl.add_to_queue( - data_type='epoch-seconds', key=None, value=timestamp, label=f'Timestamp: {timestamp}', - hover='The leading 31 bits in a TikTok ID are a timestamp, ' - '
thought to represent when an item was posted or created.', + data_type='epoch-seconds', key=None, value=float(f"{timestamp}.{milliseconds}"), + label=f'Timestamp: {timestamp}.{milliseconds:03d}', + # Ref: https://arxiv.org/pdf/2504.13279 + hover='The leading 42 bits in a TikTok ID are a timestamp, thought to represent ' + 'when an item was posted or created. The first 32 bits can be interpreted ' + 'as a Unix timestamp and the next 10 bits as the milliseconds.', + parent_id=node.node_id, incoming_edge_config=tiktok_edge) + + unfurl.add_to_queue( + data_type='integer', key=None, value=sequence, label=f'Sequence: {sequence}', + hover='Thought to be a counter or sequence number, per https://arxiv.org/pdf/2504.13279', parent_id=node.node_id, incoming_edge_config=tiktok_edge) + unfurl.add_to_queue( + data_type='integer', key=None, value=machine_id, label=f'Machine ID: {machine_id}', + hover='Thought to be a data center or machine ID, with different geographic areas correlating to different IDs.', + parent_id=node.node_id, incoming_edge_config=tiktok_edge) + + unfurl.add_to_queue( + data_type='descriptor', key=None, value=entity_type_str, label=f'Entity Type: {entity_type_str}', + hover='Thought to be a counter or sequence number, per https://arxiv.org/pdf/2504.13279', + parent_id=node.node_id, incoming_edge_config=tiktok_edge) + + +def create_tiktok_id(timestamp=None, days_ahead=None, sequence=0, machine_id=1, entity_type='video'): + # Neither are set; make the timestamp now. + if not timestamp and not days_ahead: + timestamp = int(datetime.now().timestamp()) + # timestamp is a string; parse it to epoch seconds + elif isinstance(timestamp, str): + timestamp = int(datetime.fromisoformat(timestamp).timestamp()) + # Make the timestamp now + days_ahead + elif not timestamp and days_ahead: + timestamp = int(datetime.now().timestamp()) + (days_ahead * 86400) + + timestamp_bits = set_bits(timestamp, 32) + sequence_bits = set_bits(sequence, 14) + machine_id_bits = set_bits(machine_id, 0) + entity_type_bits = set_bits(13, 8) + + return int(timestamp_bits + sequence_bits + machine_id_bits + entity_type_bits) def run(unfurl, node): + min_reasonable_date = create_tiktok_id('2017-12-01T00:00:00') + max_reasonable_date = create_tiktok_id(days_ahead=365) if node.data_type == 'url.path.segment': - if 'tiktok.com' in unfurl.find_preceding_domain(node): if node.key == 1: if node.value.startswith('@'): @@ -60,11 +128,11 @@ def run(unfurl, node): data_type='descriptor', key=None, value='TikTok video was embedded on another site', parent_id=node.node_id, incoming_edge_config=tiktok_edge) - # Check if TikTok ID timestamp would be between 2017-12 and 2025-05 - elif unfurl.check_if_int_between(node.value, 6500000000000000000, 7500000000000000000): + # Check if TikTok ID timestamp would be "reasonable" + elif unfurl.check_if_int_between(node.value, min_reasonable_date, max_reasonable_date): parse_tiktok_id(unfurl, node) # If it's the "root" node and a plausible TikTok ID, parse it. # This case covers someone parsing just a TikTok ID, not a full URL. - elif node.node_id == 1 and unfurl.check_if_int_between(node.value, 6500000000000000000, 7500000000000000000): + elif node.node_id == 1 and unfurl.check_if_int_between(node.value, min_reasonable_date, max_reasonable_date): parse_tiktok_id(unfurl, node, on_tiktok=False) diff --git a/unfurl/utils.py b/unfurl/utils.py index 3057cca..79a77ad 100644 --- a/unfurl/utils.py +++ b/unfurl/utils.py @@ -69,3 +69,25 @@ def wrap_hover_text(hover_text: Union[str, None]) -> Union[str, None]: # "Wrap" the hover text by splitting it into lines of length , # then joining them together with a
. return '
'.join(textwrap.wrap(hover_text, width=60)) + + +def extract_bits(identifier: int, start: int, end: int) -> int: + """ + Extract a subset of bits from an integer based on specified start and + end positions. This operation shifts the specified bit range to the + rightmost (least-significant) position and applies a mask to select + the desired bits. + + :param identifier: The integer value from which bits will be extracted. + :param start: The starting index of the bit range (inclusive). + :param end: The ending index of the bit range (exclusive). + :return: Extracted bits as an integer. + """ + + shifted = identifier >> start + mask = (1 << (end - start)) - 1 + return shifted & mask + +def set_bits(value: int, offset: int, max_size=None) -> int: + return int(value << offset) +