1
+ import unicodedata
1
2
from typing import Optional , Set
2
3
3
4
from ..corpora .usfm_token import UsfmToken
6
7
7
8
class TextSegment :
8
9
def __init__ (self ):
9
- self ._text = ""
10
+ self ._text : GraphemeString = GraphemeString ( "" )
10
11
self ._immediate_preceding_marker : UsfmMarkerType = UsfmMarkerType .NO_MARKER
11
12
self ._markers_in_preceding_context : Set [UsfmMarkerType ] = set ()
12
13
self .previous_segment : Optional [TextSegment ] = None
@@ -31,18 +32,18 @@ def __eq__(self, value):
31
32
return True
32
33
33
34
@property
def text(self) -> "GraphemeString":
    """Return the text held by this segment as a ``GraphemeString``."""
    current_text = self._text
    return current_text
36
37
37
38
@property
def length(self) -> int:
    """Return ``len()`` of the segment's stored text."""
    stored_text = self._text
    return len(stored_text)
40
41
41
42
def substring_before(self, index: int) -> str:
    """Return, as a plain ``str``, the part of the text that precedes ``index``.

    ``index`` is applied as the stop of a slice on the stored text, so it is
    interpreted in whatever unit that text object indexes by.
    """
    leading_part = self._text[slice(None, index)]
    return leading_part.string
43
44
44
45
def substring_after(self, index: int) -> str:
    """Return, as a plain ``str``, the part of the text from ``index`` onwards.

    ``index`` is applied as the start of a slice on the stored text, so it is
    interpreted in whatever unit that text object indexes by.
    """
    trailing_part = self._text[slice(index, None)]
    return trailing_part.string
46
47
47
48
def marker_is_in_preceding_context(self, marker: UsfmMarkerType) -> bool:
    """Report whether ``marker`` is in this segment's set of preceding-context markers."""
    preceding_markers = self._markers_in_preceding_context
    return marker in preceding_markers
@@ -54,9 +55,9 @@ def is_last_segment_in_verse(self) -> bool:
54
55
return self .index_in_verse == self .num_segments_in_verse - 1
55
56
56
57
def replace_substring(self, start_index: int, end_index: int, replacement: str) -> None:
    """Swap the text between ``start_index`` and ``end_index`` for ``replacement``.

    The new text is the result of ``substring_before(start_index)``, then
    ``replacement``, then ``substring_after(end_index)``. When a USFM token is
    attached to this segment, its ``text`` is updated to the same plain string.
    """
    prefix = self.substring_before(start_index)
    suffix = self.substring_after(end_index)
    self._text = GraphemeString(prefix + replacement + suffix)

    token = self._usfm_token
    if token is None:
        return
    # Keep the attached token in step with the edited segment text.
    token.text = self._text.string
60
61
61
62
class Builder :
62
63
def __init__ (self ):
@@ -76,8 +77,74 @@ def set_usfm_token(self, token: UsfmToken) -> "TextSegment.Builder":
76
77
return self
77
78
78
79
def set_text(self, text: str) -> "TextSegment.Builder":
    """Store ``text`` on the segment under construction, wrapped in a ``GraphemeString``.

    Returns the builder itself so that calls can be chained.
    """
    segment = self._text_segment
    segment._text = GraphemeString(text)
    return self
81
82
82
83
def build(self) -> "TextSegment":
    """Return the segment this builder has been configuring."""
    finished_segment = self._text_segment
    return finished_segment
85
+
86
+
87
class GraphemeString:
    """A string that is measured and indexed in graphemes instead of code points.

    A grapheme is approximated as one character together with every combining
    mark (Unicode general categories ``Mn`` and ``Mc``) that directly follows
    it. A combining mark at the very start of the string has no base character
    to attach to, so it begins a grapheme of its own; this guarantees that
    every character of the underlying string is reachable through an index.

    ``len()``, integer indexing, slicing and iteration all operate on
    graphemes. Indexing and slicing return new ``GraphemeString`` objects; use
    ``.string`` or ``str()`` to get the plain text back.
    """

    _COMBINING_MARK_CATEGORIES = frozenset(("Mc", "Mn"))

    def __init__(self, string: str) -> None:
        """Wrap ``string`` and precompute where each grapheme starts."""
        self._string = string
        marks = self._COMBINING_MARK_CATEGORIES
        # Index 0 always starts a grapheme, even if it holds a combining mark;
        # otherwise leading marks would belong to no grapheme and be dropped
        # by every slice.
        grapheme_starts = [
            i for i, c in enumerate(string) if i == 0 or unicodedata.category(c) not in marks
        ]
        self._string_index_by_grapheme_index = dict(enumerate(grapheme_starts))
        # Reverse map so string -> grapheme lookups are O(1) instead of a scan.
        self._grapheme_index_by_string_index = {
            string_index: grapheme_index for grapheme_index, string_index in enumerate(grapheme_starts)
        }

    def __len__(self) -> int:
        """Return the number of graphemes."""
        return len(self._string_index_by_grapheme_index)

    @property
    def string(self) -> str:
        """The underlying plain string."""
        return self._string

    def __str__(self) -> str:
        return self._string

    def __repr__(self) -> str:
        return f"{type(self).__name__}({self._string!r})"

    def __eq__(self, other) -> bool:
        """Two grapheme strings are equal when their underlying strings are equal."""
        if not isinstance(other, GraphemeString):
            return False
        return self._string == other.string

    def __hash__(self) -> int:
        # Defined alongside __eq__ so instances stay usable in sets and dict keys.
        return hash(self._string)

    def __iter__(self):
        """Yield each grapheme as its own ``GraphemeString``."""
        # Explicit iterator: __getitem__ never raises IndexError, so the legacy
        # sequence-iteration fallback would never terminate.
        for grapheme_index in range(len(self)):
            yield self._grapheme_span(grapheme_index, grapheme_index + 1)

    def __getitem__(self, key) -> "GraphemeString":
        """Return the grapheme at an integer index, or the graphemes in a slice.

        Integer indices may be negative. An out-of-range integer index yields
        an empty ``GraphemeString`` rather than raising. Slices follow ``str``
        slicing rules (out-of-range bounds are clamped), but only a step of 1
        is supported.

        Raises:
            TypeError: if ``key`` is neither an int nor a slice, or if the
                slice has a step other than 1.
        """
        if isinstance(key, int):
            grapheme_index = key + len(self) if key < 0 else key
            if not 0 <= grapheme_index < len(self):
                return GraphemeString("")
            return self._grapheme_span(grapheme_index, grapheme_index + 1)
        if isinstance(key, slice):
            if key.step is not None and key.step != 1:
                raise TypeError("Steps are not allowed in _GraphemeString slices")
            grapheme_start = self._normalize_start_index(key.start)
            grapheme_stop = self._normalize_stop_index(key.stop)
            return self._grapheme_span(grapheme_start, grapheme_stop)
        raise TypeError("Indices must be integers or slices")

    def _grapheme_span(self, grapheme_start: int, grapheme_stop: int) -> "GraphemeString":
        """Return graphemes ``[grapheme_start, grapheme_stop)``; bounds must be non-negative."""
        # A grapheme index at or past the end maps to the end of the *string*
        # (code-point length), not to the grapheme count.
        end_of_string = len(self._string)
        string_start = self._string_index_by_grapheme_index.get(grapheme_start, end_of_string)
        string_stop = self._string_index_by_grapheme_index.get(grapheme_stop, end_of_string)
        return GraphemeString(self._string[string_start:string_stop])

    def _clamp_index(self, index: int) -> int:
        """Resolve a possibly negative grapheme index into the range ``[0, len(self)]``."""
        length = len(self)
        if index < 0:
            index += length
        return min(max(index, 0), length)

    def _normalize_start_index(self, index: Optional[int]) -> int:
        """Return the effective slice start: 0 for ``None``, otherwise the clamped index."""
        if index is None:
            return 0
        return self._clamp_index(index)

    def _normalize_stop_index(self, index: Optional[int]) -> int:
        """Return the effective slice stop: ``len(self)`` for ``None``, otherwise the clamped index."""
        if index is None:
            return len(self)
        return self._clamp_index(index)

    def string_index_to_grapheme_index(self, string_index: int) -> int:
        """Convert an index into the underlying string to a grapheme index.

        ``string_index`` must be the position where a grapheme starts, or the
        length of the string (which maps to ``len(self)``).

        Raises:
            ValueError: if ``string_index`` is not the start of a grapheme.
        """
        if string_index == len(self._string):
            return len(self)
        try:
            return self._grapheme_index_by_string_index[string_index]
        except KeyError:
            raise ValueError(
                f"No corresponding grapheme index found for string index {string_index}."
            ) from None
0 commit comments