diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java index 0939b56c38..b509ecb77f 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDB.java @@ -24,13 +24,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.time.DateTimeException; import java.time.DayOfWeek; import java.time.Instant; import java.time.ZoneId; import java.time.zone.ZoneOffsetTransition; import java.time.zone.ZoneOffsetTransitionRule; import java.time.zone.ZoneRules; -import java.time.zone.ZoneRulesException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -42,7 +42,7 @@ /** * Gpu timezone utility. - * + *

* Provides the following APIs * - Timezone rebasing APIs: `fromTimestampToUtcTimestamp`, etc. * - Utilities for casting string with timezone to timestamp APIs @@ -66,7 +66,7 @@ public class GpuTimeZoneDB { * If a timezone has DST, then the list has 12 integers, which contains 2 * rules(start rule and end rule) * The integers in a list are: - * + *

* index 0: month:int, // from 1 (January) to 12 (December) * index 1: dayOfMonth: int, // from -28 to 31 excluding 0 * index 2: dayOfWeek: int, // from 0 (Monday) to 6 (Sunday), -1 means ignore @@ -189,7 +189,7 @@ public static boolean isSupportedTimeZone(String zoneId) { // check that zoneID is valid and supported by Java getZoneId(zoneId); return true; - } catch (ZoneRulesException e) { + } catch (DateTimeException e) { return false; } } @@ -531,7 +531,6 @@ private static ColumnVector getOffsetsForUtilTZ(OrcTimezoneInfo info) { private static Table getTableForUtilTZ(OrcTimezoneInfo info) { if (info.transitions == null) { - // fixed offset timezone return null; } try (ColumnVector trans = getTransitionsForUtilTZ(info); @@ -544,7 +543,10 @@ private static Table getTableForUtilTZ(OrcTimezoneInfo info) { /** * Only for testing purpose. - * Get all supported timezones for ORC timezone conversion. + * Get all supported timezones for ORC timezone conversion. The returned list + * is the same as {@link OrcTimezoneInfo#getAllTimezoneIds()}: it is already + * filtered to ids that {@link OrcTimezoneInfo#get(String)} can build, so + * callers do not need to pre-filter via {@link #isSupportedTimeZone(String)}. */ static List getOrcSupportedTimezones() { return OrcTimezoneInfo.getAllTimezoneIds(); diff --git a/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java b/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java index 29b2631eb1..348b3cfeb2 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java @@ -1,23 +1,46 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.nvidia.spark.rapids.jni; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.net.URL; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; +import java.time.DateTimeException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.zone.ZoneOffsetTransition; +import java.time.zone.ZoneRules; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.TimeZone; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; /** - * Used to hold timezone info read from `java.util.TimeZone` - * This class is used for ORC timezone conversion. - * For the other timezone conversions, it uses `java.time.ZoneId` APIs. - * The information is generated from OpenJDK 8. So some timezones in newer JDKs are missing. - * The reason why we do not read timezone info directly from `java.util.TimeZone`: - * `sun.util.calendar.ZoneInfo` is not public API, on some JDK distributions (like Oracle JDK), - * it's not accessible, E.g.: report error: package sun.util.calendar is not visible + * Holds ORC timezone metadata generated at runtime from public java.time/java.util APIs. + * Historical transitions come from ZoneRules, while offsets before the first transition are + * derived from java.util.TimeZone so ORC rebasing matches + * SerializationUtils.convertBetweenTimezones semantics without relying on non-public ZoneInfo APIs. + * + *

Runtime dependency: because the metadata is generated on the fly from + * {@link java.util.TimeZone}/{@link java.time.zone.ZoneRules}, the exact transition table is + * determined by the JVM's bundled IANA {@code tzdata}. Different JDK distributions or + * {@code tzdata} versions may produce slightly different historical transitions for the same + * zone id. This is strictly more correct than the previous frozen OpenJDK-8 snapshot, but users + * debugging cross-environment differences should first check the JVM's {@code tzdata} version. */ class OrcTimezoneInfo { public OrcTimezoneInfo(int rawOffset, long[] transitions, int[] offsets) { @@ -27,13 +50,36 @@ public OrcTimezoneInfo(int rawOffset, long[] transitions, int[] offsets) { } // in milliseconds - int rawOffset; + final int rawOffset; // in milliseconds - long[] transitions; + final long[] transitions; // in milliseconds - int[] offsets; + final int[] offsets; + + // Lower bound of the range ORC supports (year 0001-01-01 UTC). Computed via + // java.time.LocalDate, which uses the proleptic Gregorian calendar, whereas + // java.util.TimeZone.getOffset(long) internally uses a hybrid Julian/Gregorian + // calendar with the 1582 cutover for date-field interpretations. In practice + // this difference does not affect offset lookup (which is purely instant-based + // for ZoneInfo), so the two calendars agree on the offset at this instant. + private static final long MIN_SUPPORTED_ORC_UTC_MILLIS = utcMillisForDate(1, 1, 1); + // Base probe width used by collectTimeZoneTransitionsByScanning. The scanner + // detects a transition by sampling tz.getOffset(probe) and comparing it to + // the running offset; a pair of transitions A->B->A whose two endpoints fall + // inside one probe step will net to zero and slip through. 
6 hours is + // assumed to be smaller than the minimum spacing between any two real transitions in + // current IANA tzdata (TODO: confirm). It is only the base step: the scanner doubles + // it while the offset stays unchanged, so widened windows lack this guarantee. + private static final long HISTORICAL_TRANSITION_SCAN_STEP_MILLIS = 6L * 3600_000L; + + // year, month, and day are all 1-indexed, matching LocalDate.of conventions + // (e.g. month=1 is January). This avoids the easy-to-misread mix of 0-based + // month and 1-based day at the call site. + private static long utcMillisForDate(int year, int month, int day) { + return LocalDate.of(year, month, day).toEpochDay() * 24L * 3600_000L; + } @Override public String toString() { @@ -44,180 +90,231 @@ public String toString() { '}'; } - // The following is Static fields and methods. - // The `orc_timezone_info.data` file is generated from `sun.util.calendar.ZoneInfo` on OpenJDK 8 - // It first reads `transitions` and `offsets` fields from `ZoneInfo` via reflection. - // Then calculate the actual transition and offset values via: - // - actual transition = transition >> 12 - // - actual offset = offsets[transition & 0x0FL] - // For more details, please refer to `sun.util.calendar.ZoneInfo` source code. - - // Refer to `serializeTimezoneInfo` method for how to generate the file. - private static final String ORC_TIMEZONE_FILE = "orc_timezone_info.data"; - - // the mapped memory for the file - private static MappedByteBuffer serializedBuf = null; + private static final ConcurrentMap RUNTIME_TIMEZONE_INFOS = + new ConcurrentHashMap<>(); - static { - readTimezoneInfoFromFile(); + /** + * Get timezone info for the specified timezone ID. + * Historical transitions are generated at runtime from public JVM APIs and cached per ID. + * + * @param timezoneId timezone ID + * @return timezone info + * @throws IllegalArgumentException if {@code timezoneId} is not a valid zone ID accepted + * by {@link GpuTimeZoneDB#getZoneId(String)}. 
There is no silent fallback to GMT. + */ + public static OrcTimezoneInfo get(String timezoneId) { + return RUNTIME_TIMEZONE_INFOS.computeIfAbsent( + timezoneId, + OrcTimezoneInfo::buildRuntimeOrcTimezoneInfo); } - private static void readTimezoneInfoFromFile() { - URL path = OrcTimezoneInfo.class.getClassLoader().getResource(ORC_TIMEZONE_FILE); - if (path == null) { - throw new RuntimeException("Can not find ORC timezone info file " + ORC_TIMEZONE_FILE); + /** + * Build ORC timezone metadata from public java.time/java.util APIs. Invalid IDs use the same + * validation as {@link GpuTimeZoneDB#getZoneId(String)} and fail with + * {@link IllegalArgumentException} (no silent fallback to GMT). + * + *

Cost: this is non-trivial — it scans every historical {@link ZoneOffsetTransition} + * from year 1 onward. Results are cached in {@link #RUNTIME_TIMEZONE_INFOS} (see + * {@link #get(String)}), so callers should always go through {@code get(...)} rather than + * invoking this directly. + */ + private static OrcTimezoneInfo buildRuntimeOrcTimezoneInfo(String timezoneId) { + final ZoneId zoneId; + try { + zoneId = GpuTimeZoneDB.getZoneId(timezoneId); + } catch (DateTimeException e) { + throw new IllegalArgumentException("Timezone ID not found: " + timezoneId, e); } - try (RandomAccessFile file = new RandomAccessFile(path.getPath(), "r"); - FileChannel fileChannel = file.getChannel()) { - - if (fileChannel.size() > 2 * 1024 * 1024) { // > 2M - throw new RuntimeException("Failed to load ORC timezone info, file is too large > 2M."); - } - - // Map the file into memory - serializedBuf = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size()); - } catch (IOException e) { - throw new RuntimeException("Failed to load ORC timezone info file " + ORC_TIMEZONE_FILE, e); + ZoneRules rules = zoneId.getRules(); + if (rules.isFixedOffset()) { + // IDs like "+05:30" are valid ZoneIds but TimeZone.getTimeZone() silently + // maps them to GMT (offset 0). Derive the offset from ZoneRules instead so + // the GPU path doesn't treat them as UTC. + int fixedOffsetMs = rules.getOffset(Instant.EPOCH).getTotalSeconds() * 1000; + return new OrcTimezoneInfo(fixedOffsetMs, null, null); } + // Use the canonical ID from the resolved ZoneId (e.g. "Asia/Kolkata" for + // input "IST") so that TimeZone and ZoneRules always refer to the same + // zone, regardless of how the JVM's legacy TimeZone database maps + // 3-letter aliases. ZoneId.SHORT_IDS in getZoneId resolves "IST" to + // "Asia/Kolkata"; TimeZone.getTimeZone("IST") may map to a different + // zone on some JVM distributions, which would silently produce mixed + // offset data with no exception. 
+ TimeZone tz = TimeZone.getTimeZone(zoneId.getId()); + List transitionList = rules.getTransitions(); + HistoricalTransitions historicalTransitions = buildHistoricalTransitions(tz, transitionList); + if (historicalTransitions.transitions == null) { + return new OrcTimezoneInfo(tz.getRawOffset(), null, null); + } + return new OrcTimezoneInfo(tz.getRawOffset(), + historicalTransitions.transitions, historicalTransitions.offsets); } /** - * Get timezone info for the specified timezone Id - * @param timezoneId timezone Id - * @return timezone info + * Returns the sorted list of timezone IDs that {@link #get(String)} can build — + * the intersection of {@link TimeZone#getAvailableIDs()} and + * {@link GpuTimeZoneDB#isSupportedTimeZone(String)}. POSIX-style entries (e.g. + * {@code "EST5EDT"}, {@code "SystemV/AST4"}) that some JDK builds expose but + * {@code ZoneId.of(id, ZoneId.SHORT_IDS)} rejects are filtered out. + * + *

The result is computed on every call; callers that need it repeatedly + * should cache it themselves. + * + * @return sorted list of ORC-supported timezone IDs */ - public static OrcTimezoneInfo get(String timezoneId) { - int index = Arrays.binarySearch(timezoneIds, timezoneId); - if (index < 0) { - throw new IllegalArgumentException("Timezone ID not found: " + timezoneId); + public static List getAllTimezoneIds() { + String[] ids = TimeZone.getAvailableIDs(); + Arrays.sort(ids); + List result = new ArrayList<>(ids.length); + for (String id : ids) { + if (GpuTimeZoneDB.isSupportedTimeZone(id)) { + result.add(id); + } + } + return result; + } + + private static int getInitialOffset(TimeZone tz) { + // ORC only supports timestamps from year 0001 onward. For dates before the + // first historical transition in that range, java.util.TimeZone can differ + // from ZoneRules' earliest wall offset (for example, it may use the zone's + // standard raw offset instead of an older LMT offset). Sample the beginning + // of the supported range so the GPU matches TimeZone.getOffset(). 
+ return tz.getOffset(MIN_SUPPORTED_ORC_UTC_MILLIS); + } + + private static HistoricalTransitions buildHistoricalTransitions( + TimeZone tz, + List transitionList) { + if (transitionList.isEmpty()) { + return HistoricalTransitions.EMPTY; } - // shallow copy - ByteBuffer buf = serializedBuf.duplicate(); - buf.order(ByteOrder.BIG_ENDIAN); + List transitions = new ArrayList<>(); + List offsets = new ArrayList<>(); + long scanCursor = MIN_SUPPORTED_ORC_UTC_MILLIS; + int currentOffset = getInitialOffset(tz); - int timezoneInfoOffsetInFile = buf.getInt(Integer.BYTES * index); - buf.position(timezoneInfoOffsetInFile); + for (ZoneOffsetTransition transition : transitionList) { + long transitionMs = transition.getInstant().toEpochMilli(); + if (transitionMs < MIN_SUPPORTED_ORC_UTC_MILLIS) { + continue; + } - int rawOffsets = buf.getInt(); + long beforeTransitionMs = transitionMs - 1; + int offsetBeforeTransition = tz.getOffset(beforeTransitionMs); + // Invariant: between two consecutive entries returned by + // ZoneRules.getTransitions(), the wall offset is constant — no hidden + // paired round-trips (e.g. A->B->A) net to zero between entries. If + // that ever breaks (DST zones, future tzdata revisions), the guard + // below will not fire and both transitions in the pair will be + // silently dropped. The DST guard in + // GpuTimeZoneDB.convertOrcTimezones currently keeps this dormant; + // any follow-up that relaxes it must revisit this code. 
+ if (beforeTransitionMs >= scanCursor && offsetBeforeTransition != currentOffset) { + currentOffset = collectTimeZoneTransitionsByScanning( + tz, scanCursor, beforeTransitionMs, currentOffset, transitions, offsets); + } - int numTransitions = buf.getInt(); - long[] transitions = new long[numTransitions]; - for (int i = 0; i < numTransitions; ++i) { - transitions[i] = buf.getLong(); + int offsetAtTransition = tz.getOffset(transitionMs); + if (offsetAtTransition != offsetBeforeTransition) { + transitions.add(transitionMs); + offsets.add(offsetAtTransition); + currentOffset = offsetAtTransition; + } + scanCursor = transitionMs; } - int numOffsets = buf.getInt(); - int[] offsets = new int[numOffsets]; - for (int i = 0; i < numOffsets; ++i) { - offsets[i] = buf.getInt(); + if (transitions.isEmpty()) { + return HistoricalTransitions.EMPTY; } + return new HistoricalTransitions(toLongArray(transitions), toIntArray(offsets)); + } - return new OrcTimezoneInfo(rawOffsets, transitions, offsets); + private static int collectTimeZoneTransitionsByScanning( + TimeZone tz, + long scanStartMs, + long scanEndMs, + int startOffset, + List transitions, + List offsets) { + long cursor = scanStartMs; + int currentOffset = startOffset; + while (cursor < scanEndMs) { + // Exponentially expand the probe step while the offset stays equal to + // currentOffset. This collapses long no-transition stretches (e.g. the + // year-0001-to-first-historical-transition gap, ~1880 years for typical + // IANA zones) from O(N) fixed-step probes to O(log N). Once the probe lands on + // a different offset, the [lo, hi] bracket contains a transition and we + // hand it to binarySearchTransition. The bracket may be wider than the + // base 6h step, so this assumes at most one offset transition lives in + // the expanded window — which holds for real IANA data; A->B->A pairs + // narrower than the base step are addressed separately by the step size. 
+ long lo = cursor; + long step = HISTORICAL_TRANSITION_SCAN_STEP_MILLIS; + long hi = Math.min(lo + step, scanEndMs); + int hiOffset = tz.getOffset(hi); + while (hiOffset == currentOffset && hi < scanEndMs) { + lo = hi; + step = Math.min(step * 2L, scanEndMs - hi); + hi = lo + step; + hiOffset = tz.getOffset(hi); + } + if (hiOffset == currentOffset) { + // Reached scanEndMs without seeing any transition. + cursor = hi; + continue; + } + + long exactTransition = binarySearchTransition(tz, lo, hi); + int offsetAfterTransition = tz.getOffset(exactTransition); + transitions.add(exactTransition); + offsets.add(offsetAfterTransition); + currentOffset = offsetAfterTransition; + cursor = exactTransition; + } + return currentOffset; } - public static List getAllTimezoneIds() { - return Arrays.asList(timezoneIds); + private static long binarySearchTransition(TimeZone tz, long lo, long hi) { + int loOffset = tz.getOffset(lo); + while (hi - lo > 1) { + long mid = lo + (hi - lo) / 2; + if (tz.getOffset(mid) == loOffset) { + lo = mid; + } else { + hi = mid; + } + } + return hi; } - private static final String[] timezoneIds = {"ACT", "AET", "AGT", "ART", "AST", "Africa/Abidjan", "Africa/Accra", "Africa/Addis_Ababa", "Africa/Algiers", "Africa/Asmara", "Africa/Asmera", "Africa/Bamako", "Africa/Bangui", "Africa/Banjul", "Africa/Bissau", "Africa/Blantyre", "Africa/Brazzaville", "Africa/Bujumbura", "Africa/Cairo", "Africa/Casablanca", "Africa/Ceuta", "Africa/Conakry", "Africa/Dakar", "Africa/Dar_es_Salaam", "Africa/Djibouti", "Africa/Douala", "Africa/El_Aaiun", "Africa/Freetown", "Africa/Gaborone", "Africa/Harare", "Africa/Johannesburg", "Africa/Juba", "Africa/Kampala", "Africa/Khartoum", "Africa/Kigali", "Africa/Kinshasa", "Africa/Lagos", "Africa/Libreville", "Africa/Lome", "Africa/Luanda", "Africa/Lubumbashi", "Africa/Lusaka", "Africa/Malabo", "Africa/Maputo", "Africa/Maseru", "Africa/Mbabane", "Africa/Mogadishu", "Africa/Monrovia", "Africa/Nairobi", "Africa/Ndjamena", 
"Africa/Niamey", "Africa/Nouakchott", "Africa/Ouagadougou", "Africa/Porto-Novo", "Africa/Sao_Tome", "Africa/Timbuktu", "Africa/Tripoli", "Africa/Tunis", "Africa/Windhoek", "America/Adak", "America/Anchorage", "America/Anguilla", "America/Antigua", "America/Araguaina", "America/Argentina/Buenos_Aires", "America/Argentina/Catamarca", "America/Argentina/ComodRivadavia", "America/Argentina/Cordoba", "America/Argentina/Jujuy", "America/Argentina/La_Rioja", "America/Argentina/Mendoza", "America/Argentina/Rio_Gallegos", "America/Argentina/Salta", "America/Argentina/San_Juan", "America/Argentina/San_Luis", "America/Argentina/Tucuman", "America/Argentina/Ushuaia", "America/Aruba", "America/Asuncion", "America/Atikokan", "America/Atka", "America/Bahia", "America/Bahia_Banderas", "America/Barbados", "America/Belem", "America/Belize", "America/Blanc-Sablon", "America/Boa_Vista", "America/Bogota", "America/Boise", "America/Buenos_Aires", "America/Cambridge_Bay", "America/Campo_Grande", "America/Cancun", "America/Caracas", "America/Catamarca", "America/Cayenne", "America/Cayman", "America/Chicago", "America/Chihuahua", "America/Ciudad_Juarez", "America/Coral_Harbour", "America/Cordoba", "America/Costa_Rica", "America/Coyhaique", "America/Creston", "America/Cuiaba", "America/Curacao", "America/Danmarkshavn", "America/Dawson", "America/Dawson_Creek", "America/Denver", "America/Detroit", "America/Dominica", "America/Edmonton", "America/Eirunepe", "America/El_Salvador", "America/Ensenada", "America/Fort_Nelson", "America/Fort_Wayne", "America/Fortaleza", "America/Glace_Bay", "America/Godthab", "America/Goose_Bay", "America/Grand_Turk", "America/Grenada", "America/Guadeloupe", "America/Guatemala", "America/Guayaquil", "America/Guyana", "America/Halifax", "America/Havana", "America/Hermosillo", "America/Indiana/Indianapolis", "America/Indiana/Knox", "America/Indiana/Marengo", "America/Indiana/Petersburg", "America/Indiana/Tell_City", "America/Indiana/Vevay", 
"America/Indiana/Vincennes", "America/Indiana/Winamac", "America/Indianapolis", "America/Inuvik", "America/Iqaluit", "America/Jamaica", "America/Jujuy", "America/Juneau", "America/Kentucky/Louisville", "America/Kentucky/Monticello", "America/Knox_IN", "America/Kralendijk", "America/La_Paz", "America/Lima", "America/Los_Angeles", "America/Louisville", "America/Lower_Princes", "America/Maceio", "America/Managua", "America/Manaus", "America/Marigot", "America/Martinique", "America/Matamoros", "America/Mazatlan", "America/Mendoza", "America/Menominee", "America/Merida", "America/Metlakatla", "America/Mexico_City", "America/Miquelon", "America/Moncton", "America/Monterrey", "America/Montevideo", "America/Montreal", "America/Montserrat", "America/Nassau", "America/New_York", "America/Nipigon", "America/Nome", "America/Noronha", "America/North_Dakota/Beulah", "America/North_Dakota/Center", "America/North_Dakota/New_Salem", "America/Nuuk", "America/Ojinaga", "America/Panama", "America/Pangnirtung", "America/Paramaribo", "America/Phoenix", "America/Port-au-Prince", "America/Port_of_Spain", "America/Porto_Acre", "America/Porto_Velho", "America/Puerto_Rico", "America/Punta_Arenas", "America/Rainy_River", "America/Rankin_Inlet", "America/Recife", "America/Regina", "America/Resolute", "America/Rio_Branco", "America/Rosario", "America/Santa_Isabel", "America/Santarem", "America/Santiago", "America/Santo_Domingo", "America/Sao_Paulo", "America/Scoresbysund", "America/Shiprock", "America/Sitka", "America/St_Barthelemy", "America/St_Johns", "America/St_Kitts", "America/St_Lucia", "America/St_Thomas", "America/St_Vincent", "America/Swift_Current", "America/Tegucigalpa", "America/Thule", "America/Thunder_Bay", "America/Tijuana", "America/Toronto", "America/Tortola", "America/Vancouver", "America/Virgin", "America/Whitehorse", "America/Winnipeg", "America/Yakutat", "America/Yellowknife", "Antarctica/Casey", "Antarctica/Davis", "Antarctica/DumontDUrville", "Antarctica/Macquarie", 
"Antarctica/Mawson", "Antarctica/McMurdo", "Antarctica/Palmer", "Antarctica/Rothera", "Antarctica/South_Pole", "Antarctica/Syowa", "Antarctica/Troll", "Antarctica/Vostok", "Arctic/Longyearbyen", "Asia/Aden", "Asia/Almaty", "Asia/Amman", "Asia/Anadyr", "Asia/Aqtau", "Asia/Aqtobe", "Asia/Ashgabat", "Asia/Ashkhabad", "Asia/Atyrau", "Asia/Baghdad", "Asia/Bahrain", "Asia/Baku", "Asia/Bangkok", "Asia/Barnaul", "Asia/Beirut", "Asia/Bishkek", "Asia/Brunei", "Asia/Calcutta", "Asia/Chita", "Asia/Choibalsan", "Asia/Chongqing", "Asia/Chungking", "Asia/Colombo", "Asia/Dacca", "Asia/Damascus", "Asia/Dhaka", "Asia/Dili", "Asia/Dubai", "Asia/Dushanbe", "Asia/Famagusta", "Asia/Gaza", "Asia/Harbin", "Asia/Hebron", "Asia/Ho_Chi_Minh", "Asia/Hong_Kong", "Asia/Hovd", "Asia/Irkutsk", "Asia/Istanbul", "Asia/Jakarta", "Asia/Jayapura", "Asia/Jerusalem", "Asia/Kabul", "Asia/Kamchatka", "Asia/Karachi", "Asia/Kashgar", "Asia/Kathmandu", "Asia/Katmandu", "Asia/Khandyga", "Asia/Kolkata", "Asia/Krasnoyarsk", "Asia/Kuala_Lumpur", "Asia/Kuching", "Asia/Kuwait", "Asia/Macao", "Asia/Macau", "Asia/Magadan", "Asia/Makassar", "Asia/Manila", "Asia/Muscat", "Asia/Nicosia", "Asia/Novokuznetsk", "Asia/Novosibirsk", "Asia/Omsk", "Asia/Oral", "Asia/Phnom_Penh", "Asia/Pontianak", "Asia/Pyongyang", "Asia/Qatar", "Asia/Qostanay", "Asia/Qyzylorda", "Asia/Rangoon", "Asia/Riyadh", "Asia/Saigon", "Asia/Sakhalin", "Asia/Samarkand", "Asia/Seoul", "Asia/Shanghai", "Asia/Singapore", "Asia/Srednekolymsk", "Asia/Taipei", "Asia/Tashkent", "Asia/Tbilisi", "Asia/Tehran", "Asia/Tel_Aviv", "Asia/Thimbu", "Asia/Thimphu", "Asia/Tokyo", "Asia/Tomsk", "Asia/Ujung_Pandang", "Asia/Ulaanbaatar", "Asia/Ulan_Bator", "Asia/Urumqi", "Asia/Ust-Nera", "Asia/Vientiane", "Asia/Vladivostok", "Asia/Yakutsk", "Asia/Yangon", "Asia/Yekaterinburg", "Asia/Yerevan", "Atlantic/Azores", "Atlantic/Bermuda", "Atlantic/Canary", "Atlantic/Cape_Verde", "Atlantic/Faeroe", "Atlantic/Faroe", "Atlantic/Jan_Mayen", "Atlantic/Madeira", "Atlantic/Reykjavik", 
"Atlantic/South_Georgia", "Atlantic/St_Helena", "Atlantic/Stanley", "Australia/ACT", "Australia/Adelaide", "Australia/Brisbane", "Australia/Broken_Hill", "Australia/Canberra", "Australia/Currie", "Australia/Darwin", "Australia/Eucla", "Australia/Hobart", "Australia/LHI", "Australia/Lindeman", "Australia/Lord_Howe", "Australia/Melbourne", "Australia/NSW", "Australia/North", "Australia/Perth", "Australia/Queensland", "Australia/South", "Australia/Sydney", "Australia/Tasmania", "Australia/Victoria", "Australia/West", "Australia/Yancowinna", "BET", "BST", "Brazil/Acre", "Brazil/DeNoronha", "Brazil/East", "Brazil/West", "CAT", "CET", "CNT", "CST", "CST6CDT", "CTT", "Canada/Atlantic", "Canada/Central", "Canada/Eastern", "Canada/Mountain", "Canada/Newfoundland", "Canada/Pacific", "Canada/Saskatchewan", "Canada/Yukon", "Chile/Continental", "Chile/EasterIsland", "Cuba", "EAT", "ECT", "EET", "EST", "EST5EDT", "Egypt", "Eire", "Etc/GMT", "Etc/GMT+0", "Etc/GMT+1", "Etc/GMT+10", "Etc/GMT+11", "Etc/GMT+12", "Etc/GMT+2", "Etc/GMT+3", "Etc/GMT+4", "Etc/GMT+5", "Etc/GMT+6", "Etc/GMT+7", "Etc/GMT+8", "Etc/GMT+9", "Etc/GMT-0", "Etc/GMT-1", "Etc/GMT-10", "Etc/GMT-11", "Etc/GMT-12", "Etc/GMT-13", "Etc/GMT-14", "Etc/GMT-2", "Etc/GMT-3", "Etc/GMT-4", "Etc/GMT-5", "Etc/GMT-6", "Etc/GMT-7", "Etc/GMT-8", "Etc/GMT-9", "Etc/GMT0", "Etc/Greenwich", "Etc/UCT", "Etc/UTC", "Etc/Universal", "Etc/Zulu", "Europe/Amsterdam", "Europe/Andorra", "Europe/Astrakhan", "Europe/Athens", "Europe/Belfast", "Europe/Belgrade", "Europe/Berlin", "Europe/Bratislava", "Europe/Brussels", "Europe/Bucharest", "Europe/Budapest", "Europe/Busingen", "Europe/Chisinau", "Europe/Copenhagen", "Europe/Dublin", "Europe/Gibraltar", "Europe/Guernsey", "Europe/Helsinki", "Europe/Isle_of_Man", "Europe/Istanbul", "Europe/Jersey", "Europe/Kaliningrad", "Europe/Kiev", "Europe/Kirov", "Europe/Kyiv", "Europe/Lisbon", "Europe/Ljubljana", "Europe/London", "Europe/Luxembourg", "Europe/Madrid", "Europe/Malta", "Europe/Mariehamn", 
"Europe/Minsk", "Europe/Monaco", "Europe/Moscow", "Europe/Nicosia", "Europe/Oslo", "Europe/Paris", "Europe/Podgorica", "Europe/Prague", "Europe/Riga", "Europe/Rome", "Europe/Samara", "Europe/San_Marino", "Europe/Sarajevo", "Europe/Saratov", "Europe/Simferopol", "Europe/Skopje", "Europe/Sofia", "Europe/Stockholm", "Europe/Tallinn", "Europe/Tirane", "Europe/Tiraspol", "Europe/Ulyanovsk", "Europe/Uzhgorod", "Europe/Vaduz", "Europe/Vatican", "Europe/Vienna", "Europe/Vilnius", "Europe/Volgograd", "Europe/Warsaw", "Europe/Zagreb", "Europe/Zaporozhye", "Europe/Zurich", "GB", "GB-Eire", "GMT", "GMT0", "Greenwich", "HST", "Hongkong", "IET", "IST", "Iceland", "Indian/Antananarivo", "Indian/Chagos", "Indian/Christmas", "Indian/Cocos", "Indian/Comoro", "Indian/Kerguelen", "Indian/Mahe", "Indian/Maldives", "Indian/Mauritius", "Indian/Mayotte", "Indian/Reunion", "Iran", "Israel", "JST", "Jamaica", "Japan", "Kwajalein", "Libya", "MET", "MIT", "MST", "MST7MDT", "Mexico/BajaNorte", "Mexico/BajaSur", "Mexico/General", "NET", "NST", "NZ", "NZ-CHAT", "Navajo", "PLT", "PNT", "PRC", "PRT", "PST", "PST8PDT", "Pacific/Apia", "Pacific/Auckland", "Pacific/Bougainville", "Pacific/Chatham", "Pacific/Chuuk", "Pacific/Easter", "Pacific/Efate", "Pacific/Enderbury", "Pacific/Fakaofo", "Pacific/Fiji", "Pacific/Funafuti", "Pacific/Galapagos", "Pacific/Gambier", "Pacific/Guadalcanal", "Pacific/Guam", "Pacific/Honolulu", "Pacific/Johnston", "Pacific/Kanton", "Pacific/Kiritimati", "Pacific/Kosrae", "Pacific/Kwajalein", "Pacific/Majuro", "Pacific/Marquesas", "Pacific/Midway", "Pacific/Nauru", "Pacific/Niue", "Pacific/Norfolk", "Pacific/Noumea", "Pacific/Pago_Pago", "Pacific/Palau", "Pacific/Pitcairn", "Pacific/Pohnpei", "Pacific/Ponape", "Pacific/Port_Moresby", "Pacific/Rarotonga", "Pacific/Saipan", "Pacific/Samoa", "Pacific/Tahiti", "Pacific/Tarawa", "Pacific/Tongatapu", "Pacific/Truk", "Pacific/Wake", "Pacific/Wallis", "Pacific/Yap", "Poland", "Portugal", "ROK", "SST", "Singapore", "SystemV/AST4", 
"SystemV/AST4ADT", "SystemV/CST6", "SystemV/CST6CDT", "SystemV/EST5", "SystemV/EST5EDT", "SystemV/HST10", "SystemV/MST7", "SystemV/MST7MDT", "SystemV/PST8", "SystemV/PST8PDT", "SystemV/YST9", "SystemV/YST9YDT", "Turkey", "UCT", "US/Alaska", "US/Aleutian", "US/Arizona", "US/Central", "US/East-Indiana", "US/Eastern", "US/Hawaii", "US/Indiana-Starke", "US/Michigan", "US/Mountain", "US/Pacific", "US/Samoa", "UTC", "Universal", "VST", "W-SU", "WET", "Zulu"}; + private static long[] toLongArray(List values) { + long[] result = new long[values.size()]; + for (int i = 0; i < values.size(); i++) { + result[i] = values.get(i); + } + return result; + } - /** - * This method is only used to generate the timezone info file for maintenance purpose. - * - * The generated file is based on OpenJDK 8's `sun.util.calendar.ZoneInfo` implementation. - * Since `ZoneInfo` is not public API, on some JDK distributions (like Oracle JDK), - * it's not accessible. So we comment the method out to avoid build issues. - * - * File format: - * - First N * 4 bytes: N is number of timezone Ids - * - each 4 bytes is the offset of the timezone info in the file - * - Then each timezone info: - * - 4 bytes: rawOffset (int) - * - 4 bytes: numTransitions (int) - * - numTransitions * 8 bytes: transitions (long[]) - * - 4 bytes: numOffsets (int) - * - numOffsets * 4 bytes: offsets (int[]) - * - * How to do the maintenance: - * - update the `timezoneIds` via TimeZone.getAvailableIDs() and sort them. - * - run this method to generate the timezone info file, and copy the file to resources folder. 
- */ - public static void serializeTimezoneInfo() { -// try { -// String path = "/tmp/orc_timezone_info.data"; -// -// // sort timezone ids -// String[] ids = TimeZone.getAvailableIDs(); -// ArrayList sortedIds = new ArrayList<>(Arrays.asList(ids)); -// sortedIds.sort(String::compareTo); -// -// List timezoneOffsets = new ArrayList<>(); -// DataOutputStream out = new DataOutputStream(Files.newOutputStream(Paths.get(path))); -// -// // from ZoneInfo source code -// long OFFSET_MASK_IN_ZONE_INFO = 0x0FL; -// int TRANSITION_NSHIFT_IN_ZONE_INFO = 12; -// -// // collect offsets for each timezone -// int timezoneOffsetInFile = 0; -// for (String id : sortedIds) { -// timezoneOffsets.add(timezoneOffsetInFile); -// -// ZoneInfo zoneInfo = (ZoneInfo) TimeZone.getTimeZone(id); -// long[] trans = (long[]) FieldUtils.readField(zoneInfo, "transitions"); -// int numTransitions = trans == null ? 0 : trans.length; -// -// // timezone serialized size calculation -// timezoneOffsetInFile += 4; // rawOffset -// timezoneOffsetInFile += 4; // numTransitions -// timezoneOffsetInFile += numTransitions * 8; // transitions longs -// timezoneOffsetInFile += 4; // numOffsets -// timezoneOffsetInFile += numTransitions * 4; // offsets ints -// } -// -// // First write all timezone offsets in the file -// int totalOffsetIndicesSize = sortedIds.size() * 4; -// for (int off : timezoneOffsets) { -// out.writeInt(off + totalOffsetIndicesSize); -// } -// -// // Then write each timezone info -// for (String id : sortedIds) { -// ZoneInfo zoneInfo = (ZoneInfo) TimeZone.getTimeZone(id); -// long[] trans = (long[]) FieldUtils.readField(zoneInfo, "transitions"); -// int[] offs = (int[]) FieldUtils.readField(zoneInfo, "offsets"); -// int rawOff = (int) FieldUtils.readField(zoneInfo, "rawOffset"); -// -// int numTransitions = trans == null ? 
0 : trans.length; -// -// long[] actualTrans = new long[numTransitions]; -// int[] actualOffsets = new int[numTransitions]; -// for (int i = 0; i < numTransitions; ++i) { -// // `trans` is combination of transition and offset index -// actualTrans[i] = trans[i] >> TRANSITION_NSHIFT_IN_ZONE_INFO; -// // the `offs` is a dictionary, get the actual offset value via index -// // `trans[i] & OFFSET_MASK_IN_ZONE_INFO` is to get offset index -// actualOffsets[i] = offs[(int) (trans[i] & OFFSET_MASK_IN_ZONE_INFO)]; -// } -// -// out.writeInt(rawOff); -// -// out.writeInt(numTransitions); -// for (long t : actualTrans) { -// out.writeLong(t); -// } -// -// out.writeInt(numTransitions); -// for (int o : actualOffsets) { -// out.writeInt(o); -// } -// } -// out.flush(); -// out.close(); -// } catch (Exception e) { -// throw new RuntimeException("Failed to serialize ORC timezone info.", e); -// } + private static int[] toIntArray(List values) { + int[] result = new int[values.size()]; + for (int i = 0; i < values.size(); i++) { + result[i] = values.get(i); + } + return result; + } + + private static final class HistoricalTransitions { + static final HistoricalTransitions EMPTY = new HistoricalTransitions(null, null); + + final long[] transitions; + final int[] offsets; + + private HistoricalTransitions(long[] transitions, int[] offsets) { + this.transitions = transitions; + this.offsets = offsets; + } } } diff --git a/src/main/resources/orc_timezone_info.data b/src/main/resources/orc_timezone_info.data deleted file mode 100644 index 38b0fb56dc..0000000000 Binary files a/src/main/resources/orc_timezone_info.data and /dev/null differ diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDBTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDBTest.java index c49f5de687..d35e9826da 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDBTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GpuTimeZoneDBTest.java @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2025, NVIDIA CORPORATION. + * Copyright (c) 2025-2026, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,9 @@ import org.junit.jupiter.api.Test; import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.time.LocalDate; import java.time.LocalDateTime; @@ -60,6 +63,40 @@ private static ColumnVector convertOrcTimezonesOnCPU( return ColumnVector.timestampMicroSecondsFromLongs(results); } + @Test + void testIsSupportedTimeZone() { + // Named zones with ZoneRules. + assertTrue(GpuTimeZoneDB.isSupportedTimeZone("UTC")); + assertTrue(GpuTimeZoneDB.isSupportedTimeZone("Asia/Shanghai")); + + // Unknown id. + assertFalse(GpuTimeZoneDB.isSupportedTimeZone("Invalid/Zone")); + + // Offset-style ids: "+05:30" must be accepted; malformed offsets must be + // rejected even when the parser throws DateTimeException rather than the + // narrower ZoneRulesException. This is the regression the widened catch in + // isSupportedTimeZone guards against. + assertTrue(GpuTimeZoneDB.isSupportedTimeZone("+05:30")); + assertFalse(GpuTimeZoneDB.isSupportedTimeZone("+25:00")); + } + + @Test + void testConvertOrcTimezonesRejectsInvalidId() { + // Invalid timezone IDs must surface an exception rather than silently + // falling back to GMT. The DST guard at the top of convertOrcTimezones + // calls ZoneId.of(...), so an unknown id will throw before the runtime + // build path or the GPU kernel ever runs. We assert the broad + // RuntimeException type so this stays a regression guard even if the + // exact wrapping (DateTimeException vs IllegalArgumentException vs + // IllegalStateException) is refactored later. 
+ GpuTimeZoneDB.cacheDatabase(); + try (ColumnVector input = + ColumnVector.timestampMicroSecondsFromLongs(new long[] {0L})) { + assertThrows(RuntimeException.class, + () -> GpuTimeZoneDB.convertOrcTimezones(input, "Invalid/Zone", "UTC")); + } + } + @Test void testConvertOrcTimezones() { GpuTimeZoneDB.cacheDatabase(); diff --git a/src/test/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfoTest.java b/src/test/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfoTest.java new file mode 100644 index 0000000000..c34f2cc9a0 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfoTest.java @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2025-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.jni; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class OrcTimezoneInfoTest { + + @Test + void testGetFixedOffsetZone() { + // Fixed-offset zones must return a non-null OrcTimezoneInfo with the + // offset derived from ZoneRules (not from TimeZone.getTimeZone, which + // would silently map "+05:30" to GMT). +05:30 == 19_800_000 ms. + OrcTimezoneInfo info = OrcTimezoneInfo.get("+05:30"); + assertNotNull(info); + assertEquals(19_800_000, info.rawOffset); + assertNull(info.transitions); + assertNull(info.offsets); + } + + @Test + void testGetFixedOffsetNamedZone() { + // "UTC" is a named zone whose ZoneRules.isFixedOffset() is true. Cover + // it explicitly so a regression that treats "UTC" as a historical zone + // (non-null transitions) — or that silently maps it to GMT via + // TimeZone.getTimeZone — is caught. rawOffset must be 0. + OrcTimezoneInfo info = OrcTimezoneInfo.get("UTC"); + assertNotNull(info); + assertEquals(0, info.rawOffset); + assertNull(info.transitions); + assertNull(info.offsets); + } + + @Test + void testGetCachesByKey() { + // computeIfAbsent must return the same instance on the second call so + // that other threads sharing RUNTIME_TIMEZONE_INFOS see a stable object. + OrcTimezoneInfo a = OrcTimezoneInfo.get("Asia/Kolkata"); + OrcTimezoneInfo b = OrcTimezoneInfo.get("Asia/Kolkata"); + assertSame(a, b); + } + + @Test + void testGetThrowsOnInvalidId() { + // Documented contract: invalid IDs throw IllegalArgumentException. 
+ // There is no silent fallback to GMT. + assertThrows(IllegalArgumentException.class, + () -> OrcTimezoneInfo.get("Invalid/Zone")); + } + + @Test + void testGetAllTimezoneIdsContract() { + List ids = OrcTimezoneInfo.getAllTimezoneIds(); + + assertFalse(ids.isEmpty(), "expected at least one supported timezone id"); + assertTrue(ids.contains("UTC"), "UTC must be present"); + assertTrue(ids.contains("Asia/Shanghai"), "Asia/Shanghai must be present"); + + // Sorted ascending AND distinct (strict <, not <=, also catches duplicate ids). + for (int i = 1; i < ids.size(); i++) { + assertTrue(ids.get(i - 1).compareTo(ids.get(i)) < 0, + "list must be sorted and distinct: " + ids.get(i - 1) + " >= " + ids.get(i)); + } + + // Every id must be one that OrcTimezoneInfo.get can build — i.e. the lister + // and the loader agree. + for (String id : ids) { + assertTrue(GpuTimeZoneDB.isSupportedTimeZone(id), + "getAllTimezoneIds returned an id that isSupportedTimeZone rejects: " + id); + } + } + + @Test + void testGetHistoricalTransitionsZone() { + // Asia/Shanghai is a non-DST named zone with real historical transitions. + // Verify that the runtime build path populates both arrays consistently. + // + // Known coverage gap: zones whose ZoneRules.getTransitions() is empty + // but whose historical offset changed cannot exercise the scan-only + // path in collectTimeZoneTransitionsByScanning, because + // buildHistoricalTransitions returns EMPTY early for empty transition + // lists. Covering it would require a synthetic zone. + OrcTimezoneInfo info = OrcTimezoneInfo.get("Asia/Shanghai"); + assertNotNull(info); + assertNotNull(info.transitions, "Asia/Shanghai should have historical transitions"); + assertNotNull(info.offsets); + assertEquals(info.transitions.length, info.offsets.length, + "transitions and offsets must be the same length"); + // Transitions must be strictly increasing so the GPU binary search is well-defined. 
+ for (int i = 1; i < info.transitions.length; i++) { + assertTrue(info.transitions[i] > info.transitions[i - 1], + "transitions must be strictly increasing"); + } + } +}