From 533d5ba8eef5f31a99b55a165206dc18907c1a28 Mon Sep 17 00:00:00 2001 From: younies Date: Thu, 20 Nov 2025 20:59:32 +0100 Subject: [PATCH 1/3] ICU-23264 Add isAvailable method to MeasureUnit for efficient unit validation This new method checks if a specific unit is available for a given type and subtype, improving performance by avoiding the need to retrieve all available units. Updated parseMeasureUnitOption to utilize this method for better error handling when validating measure units. --- icu4c/source/i18n/measunit.cpp | 18 ++++++++++++++++++ icu4c/source/i18n/number_skeletons.cpp | 23 +++++------------------ icu4c/source/i18n/unicode/measunit.h | 16 ++++++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/icu4c/source/i18n/measunit.cpp b/icu4c/source/i18n/measunit.cpp index 088c136bcf0d..04b648748c0e 100644 --- a/icu4c/source/i18n/measunit.cpp +++ b/icu4c/source/i18n/measunit.cpp @@ -2729,6 +2729,24 @@ StringEnumeration* MeasureUnit::getAvailableTypes(UErrorCode &errorCode) { return result; } +bool MeasureUnit::validateAndGet(StringPiece type, StringPiece subtype, MeasureUnit &result) { + // Find the type index using binary search + int32_t typeIdx = binarySearch(gTypes, 0, UPRV_LENGTHOF(gTypes), type); + if (typeIdx == -1) { + return false; // Type not found + } + + // Find the subtype within the type's range using binary search + int32_t subtypeIdx = binarySearch(gSubTypes, gOffsets[typeIdx], gOffsets[typeIdx + 1], subtype); + if (subtypeIdx == -1) { + return false; // Subtype not found + } + + // Create the MeasureUnit and return it + result.setTo(typeIdx, subtypeIdx - gOffsets[typeIdx]); + return true; +} + bool MeasureUnit::findBySubType(StringPiece subType, MeasureUnit* output) { // Sanity checking kCurrencyOffset and final entry in gOffsets U_ASSERT(uprv_strcmp(gTypes[kCurrencyOffset], "currency") == 0); diff --git a/icu4c/source/i18n/number_skeletons.cpp b/icu4c/source/i18n/number_skeletons.cpp index 67a38dad0737..1d21d805a0e2 100644 --- a/icu4c/source/i18n/number_skeletons.cpp +++ b/icu4c/source/i18n/number_skeletons.cpp @@ -1072,25 +1072,12 @@ void blueprint_helpers::parseMeasureUnitOption(const StringSegment& segment, Mac CharString subType; SKELETON_UCHAR_TO_CHAR(subType, stemString, firstHyphen + 1, stemString.length(), status); - // Note: the largest type as of this writing (Aug 2020) is "volume", which has 33 units. - static constexpr int32_t CAPACITY = 40; - MeasureUnit units[CAPACITY]; - UErrorCode localStatus = U_ZERO_ERROR; - int32_t numUnits = MeasureUnit::getAvailable(type.data(), units, CAPACITY, localStatus); - if (U_FAILURE(localStatus)) { - // More than 30 units in this type? - status = U_INTERNAL_PROGRAM_ERROR; + MeasureUnit unit; + if (MeasureUnit::validateAndGet(type.toStringPiece(), subType.toStringPiece(), unit)) { + macros.unit = unit; return; - } - for (int32_t i = 0; i < numUnits; i++) { - auto& unit = units[i]; - if (uprv_strcmp(subType.data(), unit.getSubtype()) == 0) { - macros.unit = unit; - return; - } - } - - // throw new SkeletonSyntaxException("Unknown measure unit", segment); + } + status = U_NUMBER_SKELETON_SYNTAX_ERROR; } diff --git a/icu4c/source/i18n/unicode/measunit.h b/icu4c/source/i18n/unicode/measunit.h index b5f3a2986f74..d8c3e7c20027 100644 --- a/icu4c/source/i18n/unicode/measunit.h +++ b/icu4c/source/i18n/unicode/measunit.h @@ -720,6 +720,22 @@ class U_I18N_API MeasureUnit: public UObject { */ static StringEnumeration* getAvailableTypes(UErrorCode &errorCode); +#ifndef U_HIDE_INTERNAL_API + /** + * Validates that a specific type and subtype combination exists and retrieve the unit. + * + *

Note: This is more efficient than calling getAvailable() when you only need + * to validate and retrieve a single unit. + * + * @param type the unit type (e.g., "length", "mass", "volume") + * @param subtype the unit subtype (e.g., "meter", "kilogram", "liter") + * @param result if the unit is valid, this will be set to the MeasureUnit + * @return true if the type/subtype combination is valid, false otherwise + * @internal + */ + static bool validateAndGet(StringPiece type, StringPiece subtype, MeasureUnit &result); +#endif /* U_HIDE_INTERNAL_API */ + /** * Return the class ID for this class. This is useful only for comparing to * a return value from getDynamicClassID(). For example: From 3682dbd05cb031d77cd565c61de25d14898b53b2 Mon Sep 17 00:00:00 2001 From: younies Date: Tue, 25 Nov 2025 00:34:05 +0100 Subject: [PATCH 2/3] improve c++ and java too --- icu4c/source/i18n/measunit.cpp | 32 +++++++++++++++---- .../ibm/icu/number/NumberSkeletonImpl.java | 10 +++--- .../java/com/ibm/icu/util/MeasureUnit.java | 23 +++++++++++++ 3 files changed, 53 insertions(+), 12 deletions(-) diff --git a/icu4c/source/i18n/measunit.cpp b/icu4c/source/i18n/measunit.cpp index 04b648748c0e..965e47fe956e 100644 --- a/icu4c/source/i18n/measunit.cpp +++ b/icu4c/source/i18n/measunit.cpp @@ -2563,6 +2563,18 @@ static int32_t binarySearch( return -1; } +/** + * Helper function to get the subtype range for a given type index. + * @param typeIdx the type index (0 to UPRV_LENGTHOF(gTypes)-1) + * @param start will be set to the starting index in gSubTypes + * @param end will be set to the ending index in gSubTypes (exclusive) + */ +static void getSubtypeRange(int32_t typeIdx, int32_t &start, int32_t &end) { + U_ASSERT(typeIdx >= 0 && typeIdx < UPRV_LENGTHOF(gTypes)); + start = gOffsets[typeIdx]; + end = gOffsets[typeIdx + 1]; +} + MeasureUnit::MeasureUnit() : MeasureUnit(kBaseTypeIdx, kBaseSubTypeIdx) { } @@ -2680,7 +2692,9 @@ int32_t MeasureUnit::getAvailable( } int32_t idx = 0; for (int32_t typeIdx = 0; typeIdx < UPRV_LENGTHOF(gTypes); ++typeIdx) { - int32_t len = gOffsets[typeIdx + 1] - gOffsets[typeIdx]; + int32_t start, end; + getSubtypeRange(typeIdx, start, end); + int32_t len = end - start; for (int32_t subTypeIdx = 0; subTypeIdx < len; ++subTypeIdx) { dest[idx].setTo(typeIdx, subTypeIdx); ++idx; @@ -2702,7 +2716,9 @@ int32_t MeasureUnit::getAvailable( if (typeIdx == -1) { return 0; } - int32_t len = gOffsets[typeIdx + 1] - gOffsets[typeIdx]; + int32_t start, end; + getSubtypeRange(typeIdx, start, end); + int32_t len = end - start; if (destCapacity < len) { errorCode = U_BUFFER_OVERFLOW_ERROR; return len; @@ -2737,13 +2753,15 @@ bool MeasureUnit::validateAndGet(StringPiece type, StringPiece subtype, MeasureU } // Find the subtype within the type's range using binary search - int32_t subtypeIdx = binarySearch(gSubTypes, gOffsets[typeIdx], gOffsets[typeIdx + 1], subtype); + int32_t start, end; + getSubtypeRange(typeIdx, start, end); + int32_t subtypeIdx = binarySearch(gSubTypes, start, end, subtype); if (subtypeIdx == -1) { return false; // Subtype not found } // Create the MeasureUnit and return it - result.setTo(typeIdx, subtypeIdx - gOffsets[typeIdx]); + result.setTo(typeIdx, subtypeIdx - start); return true; } @@ -2757,9 +2775,11 @@ bool MeasureUnit::findBySubType(StringPiece subType, MeasureUnit* output) { if (t == kCurrencyOffset) { continue; } - int32_t st = binarySearch(gSubTypes, gOffsets[t], gOffsets[t + 1], subType); + int32_t start, end; + getSubtypeRange(t, start, end); + int32_t st = binarySearch(gSubTypes, start, end, subType); if (st >= 0) { - output->setTo(t, st - gOffsets[t]); + output->setTo(t, st - start); return true; } } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/number/NumberSkeletonImpl.java b/icu4j/main/core/src/main/java/com/ibm/icu/number/NumberSkeletonImpl.java index 9cefd3860288..e3a31d7b660c 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/number/NumberSkeletonImpl.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/number/NumberSkeletonImpl.java @@ -1095,12 +1095,10 @@ private static void parseMeasureUnitOption(StringSegment segment, MacroProps mac } String type = segment.subSequence(0, firstHyphen).toString(); String subType = segment.subSequence(firstHyphen + 1, segment.length()).toString(); - Set units = MeasureUnit.getAvailable(type); - for (MeasureUnit unit : units) { - if (subType.equals(unit.getSubtype())) { - macros.unit = unit; - return; - } + MeasureUnit unit = MeasureUnit.getUnit(type, subType); + if (unit != null) { + macros.unit = unit; + return; } throw new SkeletonSyntaxException("Unknown measure unit", segment); } diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/util/MeasureUnit.java b/icu4j/main/core/src/main/java/com/ibm/icu/util/MeasureUnit.java index 634e306dc244..5644cb1db69b 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/util/MeasureUnit.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/util/MeasureUnit.java @@ -870,6 +870,29 @@ public static MeasureUnit findBySubType(String subType) { return null; } + /** + * Returns the MeasureUnit instance if the given type and subtype combination is + * valid, + * or null otherwise. + * + * Example: + * "length", "meter" -> METER + * "length", "kilometer" -> KILOMETER + * "length", "kilometer-per-hour" -> null --> not valid + * + * @param type the unit type (e.g., "length", "mass", "volume") + * @param subtype the unit subtype (e.g., "meter", "kilogram", "liter") + * @return the MeasureUnit if valid, otherwise null + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public static MeasureUnit getUnit(String type, String subtype) { + populateCache(); + Map units = cache.get(type); + return units != null ? units.get(subtype) : null; + } + static final UnicodeSet ASCII = new UnicodeSet('a', 'z').freeze(); static final UnicodeSet ASCII_HYPHEN_DIGITS = new UnicodeSet('-', '-', '0', '9', 'a', 'z').freeze(); From 2a66c4b0398cf0bc7a02e088f0bc3537bf2413ac Mon Sep 17 00:00:00 2001 From: younies Date: Tue, 25 Nov 2025 00:57:14 +0100 Subject: [PATCH 3/3] add test case that fails with the old code --- icu4c/source/test/intltest/numbertest_api.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/icu4c/source/test/intltest/numbertest_api.cpp b/icu4c/source/test/intltest/numbertest_api.cpp index 695a00936c2c..c5f96a4ad993 100644 --- a/icu4c/source/test/intltest/numbertest_api.cpp +++ b/icu4c/source/test/intltest/numbertest_api.cpp @@ -888,6 +888,17 @@ void NumberFormatterApiTest::unitMeasure() { Locale("en"), 100, u"100"); + + // This test checks the behavior of using a fixed-size length for the units array with a fixed number of units. + // The array size is fixed, and we are now using a binary search approach. + assertFormatSingle( + u"One of the latest unit, volume-teaspoon", + u"measure-unit/volume-teaspoon", + u"unit/teaspoon", + NumberFormatter::with().unit(MeasureUnit::forIdentifier("teaspoon", status)), + Locale("en-US"), + 100, + u"100 tsp"); // TODO: desired behaviour for this "pathological" case? // Since this is pointless, we don't test that its behaviour doesn't change.