Skip to content

Commit 00cf1da

Browse files
committed
Apply PRs google#350 and google#257: CJK tokenizer fix + retry mechanism
PR google#350 — Fix incorrect char_interval for non-ASCII text: - Add _CJK_SCRIPTS constant for Han, Hiragana, Katakana, Hangul detection - Modify _LETTERS_PATTERN with regex V1 set subtraction to exclude CJK - Add _CJK_PATTERN for standalone CJK character matching - Update _TOKEN_PATTERN and _WORD_PATTERN with flags=regex.V1 - Fix trailing whitespace in japanese_extraction.md example PR google#257 — Add retry mechanism for transient API errors: - New retry_utils.py: is_transient_error(), retry_on_transient_errors(), retry_chunk_processing() decorators with exponential backoff + jitter - annotation.py: _process_batch_with_retry(), retry params threaded through annotate_documents/text and single/sequential pass methods - extraction.py: retry params in extract() signature, passed via retry_kwargs - gemini.py: @retry_chunk_processing() decorator on _process_single_prompt - New retry_utils_test.py + AnnotatorRetryPolicyTest in annotation_test.py Upstream: google#350, google#257 Fixes: google#334
1 parent 5a05b0d commit 00cf1da

File tree

8 files changed

+2594
-1831
lines changed

8 files changed

+2594
-1831
lines changed

docs/examples/japanese_extraction.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ for entity in result.extractions:
5151
if entity.char_interval:
5252
start, end = entity.char_interval.start_pos, entity.char_interval.end_pos
5353
position_info = f" (pos: {start}-{end})"
54-
54+
5555
print(f"{entity.extraction_class}: {entity.extraction_text}{position_info}")
5656

5757
# Expected Output:

langextract/annotation.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,139 @@ def __init__(
206206
"Annotator initialized with format_handler: %s", format_handler
207207
)
208208

209+
def _process_batch_with_retry(
    self,
    batch_prompts: list[str],
    batch: list[chunking.TextChunk],
    retry_transient_errors: bool = True,
    max_retries: int = 3,
    retry_initial_delay: float = 1.0,
    retry_backoff_factor: float = 2.0,
    retry_max_delay: float = 60.0,
    **kwargs,
) -> Iterator[list[core_types.ScoredOutput]]:
    """Run a batch through the language model, retrying chunks individually on failure.

    The whole batch is attempted first. If the batch call raises a transient
    error (e.g. a 503 "model overloaded"), each chunk is reprocessed on its
    own with exponential-backoff retries so that successful chunks are not
    lost to a single flaky request. Non-transient errors propagate unchanged.

    Args:
      batch_prompts: Prompts for the batch, one per chunk.
      batch: TextChunk objects aligned position-for-position with the prompts.
      retry_transient_errors: Whether transient errors trigger per-chunk retries.
      max_retries: Maximum retry attempts per chunk.
      retry_initial_delay: Delay before the first retry, in seconds.
      retry_backoff_factor: Multiplier applied to the delay after each attempt.
      retry_max_delay: Upper bound on the delay between retries, in seconds.
      **kwargs: Extra keyword arguments forwarded to the language model.

    Yields:
      Lists of ScoredOutputs, one list per chunk.
    """
    try:
        # Fast path: a single batched inference call.
        outputs = list(
            self._language_model.infer(
                batch_prompts=batch_prompts,
                **kwargs,
            )
        )
        yield from outputs
        return
    except Exception as batch_error:
        # Only transient failures qualify for the per-chunk fallback.
        if not retry_utils.is_transient_error(batch_error):
            raise

        logging.warning(
            "Batch processing failed with transient error: %s. "
            "Falling back to individual chunk processing with retry.",
            str(batch_error),
        )

        recovered = []
        for index, (chunk_prompt, text_chunk) in enumerate(
            zip(batch_prompts, batch)
        ):
            try:
                result = self._process_single_chunk_with_retry(
                    prompt=chunk_prompt,
                    chunk=text_chunk,
                    retry_transient_errors=retry_transient_errors,
                    max_retries=max_retries,
                    retry_initial_delay=retry_initial_delay,
                    retry_backoff_factor=retry_backoff_factor,
                    retry_max_delay=retry_max_delay,
                    **kwargs,
                )
                recovered.append(result)
            except Exception as chunk_error:
                # A chunk that still fails after retries aborts the document;
                # log enough context to identify the offending chunk.
                logging.error(
                    "Failed to process chunk %d after retries: %s. "
                    "Chunk info: document_id=%s, text_length=%d. "
                    "Stopping document processing.",
                    index,
                    str(chunk_error),
                    text_chunk.document_id,
                    len(text_chunk.chunk_text),
                )
                raise

        yield from recovered
289+
290+
def _process_single_chunk_with_retry(
    self,
    prompt: str,
    chunk: chunking.TextChunk,
    retry_transient_errors: bool = True,
    max_retries: int = 3,
    retry_initial_delay: float = 1.0,
    retry_backoff_factor: float = 2.0,
    retry_max_delay: float = 60.0,
    **kwargs,
) -> list[core_types.ScoredOutput]:
    """Infer a single chunk, retrying transient failures with backoff.

    Args:
      prompt: The prompt for this chunk.
      chunk: The TextChunk being processed (used for error reporting).
      retry_transient_errors: Whether retries are enabled at all.
      max_retries: Maximum retry attempts.
      retry_initial_delay: Delay before the first retry, in seconds.
      retry_backoff_factor: Multiplier applied to the delay between attempts.
      retry_max_delay: Upper bound on the delay between retries, in seconds.
      **kwargs: Extra keyword arguments forwarded to the language model.

    Returns:
      The ScoredOutput list produced for this single prompt.

    Raises:
      exceptions.InferenceOutputError: If the model returns no results.
    """

    # Wrap the inference call in the shared retry decorator so the
    # backoff/jitter policy lives in one place (retry_utils).
    @retry_utils.retry_chunk_processing(
        max_retries=max_retries,
        initial_delay=retry_initial_delay,
        backoff_factor=retry_backoff_factor,
        max_delay=retry_max_delay,
        enabled=retry_transient_errors,
    )
    def _attempt_inference():
        results = list(
            self._language_model.infer(
                batch_prompts=[prompt],
                **kwargs,
            )
        )
        if not results:
            raise exceptions.InferenceOutputError(
                f"No results returned for chunk in document {chunk.document_id}"
            )
        return results[0]

    return _attempt_inference()
341+
209342
def annotate_documents(
210343
self,
211344
documents: Iterable[data.Document],

0 commit comments

Comments
 (0)