|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# Use z3 to solve UTF-8 validation DFA for offset and transition table, |
| 3 | +# in order to encode transition table into u32. |
| 4 | +# We minimize the output variables in the solution to make it deterministic. |
| 5 | +# Ref: <https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f> |
| 6 | +# See more detail explanation in `./validations.rs`. |
| 7 | +# |
| 8 | +# It is expected to find a solution in <30s on a modern machine, and the |
| 9 | +# solution is appended to the end of this file. |
| 10 | +from z3 import * |
| 11 | + |
| 12 | +STATE_CNT = 9 |
| 13 | + |
| 14 | +# The transition table. |
| 15 | +# A value X on column Y means state Y should transition to state X on some |
| 16 | +# input bytes. We assign state 0 as ERROR and state 1 as ACCEPT (initial). |
| 17 | +# Eg. first line: for input byte 00..=7F, transition S1 -> S1, others -> S0. |
| 18 | +TRANSITIONS = [ |
| 19 | + # 0 1 2 3 4 5 6 7 8 |
| 20 | + # First bytes |
| 21 | + ((0, 1, 0, 0, 0, 0, 0, 0, 0), "00-7F"), |
| 22 | + ((0, 2, 0, 0, 0, 0, 0, 0, 0), "C2-DF"), |
| 23 | + ((0, 3, 0, 0, 0, 0, 0, 0, 0), "E0"), |
| 24 | + ((0, 4, 0, 0, 0, 0, 0, 0, 0), "E1-EC, EE-EF"), |
| 25 | + ((0, 5, 0, 0, 0, 0, 0, 0, 0), "ED"), |
| 26 | + ((0, 6, 0, 0, 0, 0, 0, 0, 0), "F0"), |
| 27 | + ((0, 7, 0, 0, 0, 0, 0, 0, 0), "F1-F3"), |
| 28 | + ((0, 8, 0, 0, 0, 0, 0, 0, 0), "F4"), |
| 29 | + # Continuation bytes |
| 30 | + ((0, 0, 1, 0, 2, 2, 0, 4, 4), "80-8F"), |
| 31 | + ((0, 0, 1, 0, 2, 2, 4, 4, 0), "90-9F"), |
| 32 | + ((0, 0, 1, 2, 2, 0, 4, 4, 0), "A0-BF"), |
| 33 | + # Illegal |
| 34 | + ((0, 0, 0, 0, 0, 0, 0, 0, 0), "C0-C1, F5-FF"), |
| 35 | +] |
| 36 | + |
| 37 | +o = Optimize() |
| 38 | +offsets = [BitVec(f"o{i}", 32) for i in range(STATE_CNT)] |
| 39 | +trans_table = [BitVec(f"t{i}", 32) for i in range(len(TRANSITIONS))] |
| 40 | + |
| 41 | +# Add some guiding constraints to make solving faster. |
| 42 | +o.add(offsets[0] == 0) |
| 43 | +o.add(trans_table[-1] == 0) |
| 44 | + |
| 45 | +for i in range(len(offsets)): |
| 46 | + # Do not over-shift. It's not necessary but makes solving faster. |
| 47 | + o.add(offsets[i] < 32 - 5) |
| 48 | + for j in range(i): |
| 49 | + o.add(offsets[i] != offsets[j]) |
| 50 | +for trans, (targets, _) in zip(trans_table, TRANSITIONS): |
| 51 | + for src, tgt in enumerate(targets): |
| 52 | + o.add((LShR(trans, offsets[src]) & 31) == offsets[tgt]) |
| 53 | + |
| 54 | +# Minimize ordered outputs to get a unique solution. |
| 55 | +goal = Concat(*offsets, *trans_table) |
| 56 | +o.minimize(goal) |
| 57 | +print(o.check()) |
| 58 | +print("Offset[]= ", [o.model()[i].as_long() for i in offsets]) |
| 59 | +print("Transitions:") |
| 60 | +for (_, label), v in zip(TRANSITIONS, [o.model()[i].as_long() for i in trans_table]): |
| 61 | + print(f"{label:14} => {v:#10x}, // {v:032b}") |
| 62 | + |
| 63 | +# Output should be deterministic: |
| 64 | +# sat |
| 65 | +# Offset[]= [0, 6, 16, 19, 1, 25, 11, 18, 24] |
| 66 | +# Transitions: |
| 67 | +# 00-7F => 0x180, // 00000000000000000000000110000000 |
| 68 | +# C2-DF => 0x400, // 00000000000000000000010000000000 |
| 69 | +# E0 => 0x4c0, // 00000000000000000000010011000000 |
| 70 | +# E1-EC, EE-EF => 0x40, // 00000000000000000000000001000000 |
| 71 | +# ED => 0x640, // 00000000000000000000011001000000 |
| 72 | +# F0 => 0x2c0, // 00000000000000000000001011000000 |
| 73 | +# F1-F3 => 0x480, // 00000000000000000000010010000000 |
| 74 | +# F4 => 0x600, // 00000000000000000000011000000000 |
| 75 | +# 80-8F => 0x21060020, // 00100001000001100000000000100000 |
| 76 | +# 90-9F => 0x20060820, // 00100000000001100000100000100000 |
| 77 | +# A0-BF => 0x860820, // 00000000100001100000100000100000 |
| 78 | +# C0-C1, F5-FF => 0x0, // 00000000000000000000000000000000 |
0 commit comments