Skip to content

Commit f0c9f1b

Browse files
Support Unicode 16 variation seqs for quotation mark width
1 parent 6ab41d7 commit f0c9f1b

File tree

4 files changed

+196
-33
lines changed

4 files changed

+196
-33
lines changed

scripts/unicode.py

+80-17
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
175175
- 4th bit: whether to set top bit on emoji presentation.
176176
If this is set but 3rd is not, the width mode is related to zwj sequences
177177
- 5th from top: whether this is unaffected by ligature-transparent
178+
(if set, should also set 3rd and 4th)
178179
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
179-
where no ZWJ has been encountered yet; encountering one flips this on"""
180+
where no ZWJ has been encountered yet; encountering one flips this on
181+
- 7th bit: whether this is VS1 (if CJK) or VS2 (if not CJK)
182+
"""
180183

181184
# BASIC WIDTHS
182185

@@ -272,6 +275,9 @@ class WidthState(enum.IntEnum):
272275

273276
# VARIATION SELECTORS
274277

278+
VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279+
"\\uFE00 if CJK, or \\uFE01 otherwise"
280+
275281
# Text presentation sequences (not CJK)
276282
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
277283
"\\uFE0E (text presentation sequences)"
@@ -367,6 +373,7 @@ def width_alone(self) -> int:
367373
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
368374
| WidthState.VARIATION_SELECTOR_15
369375
| WidthState.VARIATION_SELECTOR_16
376+
| WidthState.VARIATION_SELECTOR_1_OR_2
370377
):
371378
return 0
372379
case (
@@ -656,9 +663,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
656663
ea[cp] = width
657664

658665
# East-Asian only
666+
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
659667
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
660668

661669
# Not East Asian only
670+
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
662671
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15
663672

664673
return (not_ea, ea)
@@ -724,7 +733,7 @@ def load_solidus_transparent(
724733
cjk_width_map: list[WidthState],
725734
) -> list[tuple[Codepoint, Codepoint]]:
726735
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
727-
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
736+
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
728737
"""
729738

730739
ccc_above_1 = set()
@@ -756,7 +765,7 @@ def load_solidus_transparent(
756765
num_chars = len(ccc_above_1)
757766

758767
for cp in ccc_above_1:
759-
if cp != 0xFE0F:
768+
if cp not in [0xFE00, 0xFE0F]:
760769
assert (
761770
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
762771
), f"U+{cp:X}"
@@ -1312,8 +1321,17 @@ def lookup_fns(
13121321
return (0, next_info.set_emoji_presentation());
13131322
}"""
13141323

1315-
if not is_cjk:
1324+
if is_cjk:
1325+
s += """
1326+
if c == '\\u{FE00}' {
1327+
return (0, next_info.set_vs1_2());
1328+
}
1329+
"""
1330+
else:
13161331
s += """
1332+
if c == '\\u{FE01}' {
1333+
return (0, next_info.set_vs1_2());
1334+
}
13171335
if c == '\\u{FE0E}' {
13181336
return (0, next_info.set_text_presentation());
13191337
}
@@ -1323,9 +1341,19 @@ def lookup_fns(
13231341
} else {
13241342
next_info = next_info.unset_text_presentation();
13251343
}
1326-
}"""
1344+
} else """
13271345

1328-
s += """
1346+
s += """if next_info.is_vs1_2() {
1347+
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
1348+
return ("""
1349+
1350+
s += str(2 - is_cjk)
1351+
1352+
s += """, WidthInfo::DEFAULT);
1353+
} else {
1354+
next_info = next_info.unset_vs1_2();
1355+
}
1356+
}
13291357
if next_info.is_ligature_transparent() {
13301358
if c == '\\u{200D}' {
13311359
return (0, next_info.set_zwj_bit());
@@ -1586,6 +1614,8 @@ def emit_module(
15861614
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
15871615
struct WidthInfo(u16);
15881616
1617+
const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
1618+
15891619
impl WidthInfo {
15901620
/// No special handling necessary
15911621
const DEFAULT: Self = Self(0);
@@ -1615,51 +1645,84 @@ def emit_module(
16151645
16161646
/// Has top bit set
16171647
fn is_emoji_presentation(self) -> bool {{
1618-
(self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
1648+
(self.0 & WidthInfo::VARIATION_SELECTOR_16.0) == WidthInfo::VARIATION_SELECTOR_16.0
16191649
}}
16201650
1621-
/// Has top bit set
16221651
fn is_zwj_emoji_presentation(self) -> bool {{
16231652
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
16241653
}}
16251654
16261655
/// Set top bit
16271656
fn set_emoji_presentation(self) -> Self {{
1628-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
1657+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK
16291658
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
16301659
{{
1631-
Self(self.0 | 0b1000_0000_0000_0000)
1660+
Self(
1661+
self.0
1662+
| WidthInfo::VARIATION_SELECTOR_16.0
1663+
& !WidthInfo::VARIATION_SELECTOR_15.0
1664+
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1665+
)
16321666
}} else {{
16331667
Self::VARIATION_SELECTOR_16
16341668
}}
16351669
}}
16361670
16371671
/// Clear top bit
16381672
fn unset_emoji_presentation(self) -> Self {{
1639-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1640-
Self(self.0 & 0b0111_1111_1111_1111)
1673+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1674+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0)
16411675
}} else {{
16421676
Self::DEFAULT
16431677
}}
16441678
}}
16451679
16461680
/// Has 2nd bit set
16471681
fn is_text_presentation(self) -> bool {{
1648-
(self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
1682+
(self.0 & WidthInfo::VARIATION_SELECTOR_15.0) == WidthInfo::VARIATION_SELECTOR_15.0
16491683
}}
16501684
16511685
/// Set 2nd bit
16521686
fn set_text_presentation(self) -> Self {{
1653-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1654-
Self(self.0 | 0b0100_0000_0000_0000)
1687+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1688+
Self(
1689+
self.0
1690+
| WidthInfo::VARIATION_SELECTOR_15.0
1691+
& !WidthInfo::VARIATION_SELECTOR_16.0
1692+
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1693+
)
16551694
}} else {{
1656-
Self(0b0100_0000_0000_0000)
1695+
Self(WidthInfo::VARIATION_SELECTOR_15.0)
16571696
}}
16581697
}}
16591698
16601699
/// Clear 2nd bit
16611700
fn unset_text_presentation(self) -> Self {{
1662-
Self(self.0 & 0b1011_1111_1111_1111)
1701+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
1702+
}}
1703+
1704+
/// Has 7th bit set
1705+
fn is_vs1_2(self) -> bool {{
1706+
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1707+
}}
1708+
1709+
/// Set 7th bit
1710+
fn set_vs1_2(self) -> Self {{
1711+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1712+
Self(
1713+
self.0
1714+
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1715+
& !WidthInfo::VARIATION_SELECTOR_15.0
1716+
& !WidthInfo::VARIATION_SELECTOR_16.0,
1717+
)
1718+
}} else {{
1719+
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1720+
}}
1721+
}}
1722+
1723+
/// Clear 7th bit
1724+
fn unset_vs1_2(self) -> Self {{
1725+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
16631726
}}
16641727
}}
16651728

src/lib.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
6363
//! - Has the [`Emoji_Presentation`] property, and
6464
//! - Is not in the [Enclosed Ideographic Supplement] block.
65+
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
66+
//! and width 2 when followed by `'\u{FE01}'`.
6567
//! - Script-specific ligatures:
6668
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6769
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
@@ -75,7 +77,7 @@
7577
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
7678
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7779
//! have width 0.
78-
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `\u{16D68}`, `\u{16D69}`, or `\u{16D6A}` has total width 1.
80+
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
7981
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
8082
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
8183
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -158,6 +160,7 @@
158160
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
159161
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
160162
//!
163+
//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
161164
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
162165
//!
163166
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480

0 commit comments

Comments
 (0)