diff --git a/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java b/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java index 29b2631eb1..c4b7a49c45 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/OrcTimezoneInfo.java @@ -1,32 +1,94 @@ +/* + * Copyright (c) 2025-2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package com.nvidia.spark.rapids.jni; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.net.URL; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; +import java.time.DateTimeException; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.zone.ZoneOffsetTransition; +import java.time.zone.ZoneOffsetTransitionRule; +import java.time.zone.ZoneRules; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.TimeZone; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; /** - * Used to hold timezone info read from `java.util.TimeZone` - * This class is used for ORC timezone conversion. - * For the other timezone conversions, it uses `java.time.ZoneId` APIs. - * The information is generated from OpenJDK 8. So some timezones in newer JDKs are missing. 
- * The reason why we do not read timezone info directly from `java.util.TimeZone`: - * `sun.util.calendar.ZoneInfo` is not public API, on some JDK distributions (like Oracle JDK), - * it's not accessible, E.g.: report error: package sun.util.calendar is not visible + * Holds ORC timezone metadata generated at runtime from public java.time/java.util APIs. + * Historical transitions come from ZoneRules, while offsets before the first transition and + * future recurring DST behavior are validated against java.util.TimeZone so ORC rebasing matches + * SerializationUtils.convertBetweenTimezones semantics without relying on non-public ZoneInfo APIs. + * + * <p>
Runtime dependency: because the metadata is generated on the fly from + * {@link java.util.TimeZone}/{@link java.time.zone.ZoneRules}, the exact transition table and + * recurring DST rule are determined by the JVM's bundled IANA {@code tzdata}. Different JDK + * distributions or {@code tzdata} versions may produce slightly different historical + * transitions or future-year DST offsets for the same zone id. This is strictly more correct + * than the previous frozen OpenJDK-8 snapshot, but users debugging cross-environment + * differences should first check the JVM's {@code tzdata} version. */ class OrcTimezoneInfo { - public OrcTimezoneInfo(int rawOffset, long[] transitions, int[] offsets) { + OrcTimezoneInfo( + int initialOffset, + int rawOffset, + long[] transitions, + int[] offsets, + DstRule dstRule) { + this.initialOffset = initialOffset; this.rawOffset = rawOffset; this.transitions = transitions; this.offsets = offsets; + this.dstRule = dstRule; + } + + /** + * SimpleTimeZone-compatible DST rule mode. The {@link #value} encoding must + * stay in sync with {@code dst_rule_mode} in {@code timezones.cu}. + */ + enum DstRuleMode { + DOM_MODE(0), + DOW_IN_MONTH_MODE(1), + DOW_GE_DOM_MODE(2), + DOW_LE_DOM_MODE(3); + + final int value; + DstRuleMode(int value) { this.value = value; } + } + + /** + * SimpleTimeZone-compatible DST rule time mode. The {@link #value} encoding + * must stay in sync with {@code dst_time_mode} in {@code timezones.cu}. + */ + enum DstTimeMode { + WALL_TIME(0), + STANDARD_TIME(1), + UTC_TIME(2); + + final int value; + DstTimeMode(int value) { this.value = value; } } // in milliseconds + int initialOffset; + + // in milliseconds. This is the standard/raw offset used for DST rule math. 
int rawOffset; // in milliseconds @@ -35,189 +97,318 @@ public OrcTimezoneInfo(int rawOffset, long[] transitions, int[] offsets) { // in milliseconds int[] offsets; - @Override - public String toString() { - return "OrcTimezoneInfo{" + - "rawOffset=" + rawOffset + - ", transitions=" + Arrays.toString(transitions) + - ", offsets=" + Arrays.toString(offsets) + - '}'; + /** + * DST rule extracted from java.util.SimpleTimeZone for computing offsets + * beyond the historical transition table. Null if the timezone has no DST. + * + * The CUDA kernel uses this to implement SimpleTimeZone.getOffset() on GPU, + * eliminating the need for pre-generated transition files for future dates. + */ + DstRule dstRule; + + /** + * Holds the DST rule parameters needed by the GPU kernel. + * These correspond to the fields of java.util.SimpleTimeZone. + * + * {@code startMode}/{@code endMode} are encoded as {@link DstRuleMode#value}; + * {@code startTimeMode}/{@code endTimeMode} as {@link DstTimeMode#value}. + * Fields stay {@code int} because the rule is serialized to a JNI + * {@code int[]} and the GPU kernel consumes the matching integer enum. + */ + static class DstRule { + int dstSavings; // DST offset in milliseconds (typically 3600000) + int startMonth; // 0-based (Calendar.JANUARY=0 .. Calendar.DECEMBER=11) + int startDay; // day-of-month or occurrence count depending on startMode + int startDayOfWeek; // Calendar day-of-week (1=Sun, ..., 7=Sat), 0 if DOM_MODE + int startTime; // milliseconds within day + int startTimeMode; // see DstTimeMode + int startMode; // see DstRuleMode + int endMonth; + int endDay; + int endDayOfWeek; + int endTime; + int endTimeMode; + int endMode; } - // The following is Static fields and methods. - // The `orc_timezone_info.data` file is generated from `sun.util.calendar.ZoneInfo` on OpenJDK 8 - // It first reads `transitions` and `offsets` fields from `ZoneInfo` via reflection. 
- // Then calculate the actual transition and offset values via: - // - actual transition = transition >> 12 - // - actual offset = offsets[transition & 0x0FL] - // For more details, please refer to `sun.util.calendar.ZoneInfo` source code. + // Reference years used to cross-check CPU vs. GPU DST offset computation. + // We include a near-future anchor (2060) to catch divergence within the + // typical application lifetime, plus two far-future anchors to exercise the + // recurring-rule fallback path well past any historical transition entry. + private static final int[] DST_RULE_VALIDATION_YEARS = {2060, 2400, 9997}; + // Lower bound of the range ORC supports (year 0001-01-01 UTC). Computed via + // java.time.LocalDate, which uses the proleptic Gregorian calendar, whereas + // java.util.TimeZone.getOffset(long) internally uses a hybrid Julian/Gregorian + // calendar with the 1582 cutover for date-field interpretations. In practice + // this difference does not affect offset lookup (which is purely instant-based + // for ZoneInfo), and zones with DST in year 0001 do not exist, so the two + // calendars agree on the offset at this instant. Kept as a single anchor so + // the GPU side matches whatever TimeZone.getOffset returns here. + private static final long MIN_SUPPORTED_ORC_UTC_MILLIS = utcMillisForDate(1, 0, 1); + private static final long HISTORICAL_TRANSITION_SCAN_STEP_MILLIS = 24L * 3600_000L; + + /** + * Extract DST rule by probing getOffset() or from ZoneRules transition rules. + * Returns null if the timezone has no DST. 
+ */ + static DstRule extractDstRule(String timezoneId, TimeZone tz, ZoneRules rules) { + if (!tz.useDaylightTime()) { + return null; + } + DstRule dstRule = extractDstRuleByProbing(tz); + if (dstRule != null) { + return dstRule; + } + + dstRule = extractDstRuleFromZoneRules(timezoneId, tz, rules); + if (dstRule != null) { + return dstRule; + } + throw new IllegalStateException("Failed to extract ORC DST rule for timezone: " + timezoneId); + } + + private static DstRule extractDstRuleFromZoneRules(String timezoneId, TimeZone tz, + ZoneRules rules) { + List<ZoneOffsetTransitionRule> transitionRules = rules.getTransitionRules(); + if (transitionRules.isEmpty()) { + return null; + } + if (transitionRules.size() != 2) { + throw new IllegalStateException("Unsupported ORC DST rule count for timezone: " + timezoneId); + } + + ZoneOffsetTransitionRule startTransitionRule = null; + ZoneOffsetTransitionRule endTransitionRule = null; + for (ZoneOffsetTransitionRule transitionRule : transitionRules) { + int deltaMillis = (transitionRule.getOffsetAfter().getTotalSeconds() - + transitionRule.getOffsetBefore().getTotalSeconds()) * 1000; + if (deltaMillis > 0) { + startTransitionRule = transitionRule; + } else if (deltaMillis < 0) { + endTransitionRule = transitionRule; + } else { + throw new IllegalStateException("Unsupported zero-delta ORC DST rule for timezone: " + + timezoneId); + } + } + if (startTransitionRule == null || endTransitionRule == null) { + throw new IllegalStateException("Failed to identify ORC DST start/end rules for timezone: " + + timezoneId); + } - // Refer to `serializeTimezoneInfo` method for how to generate the file. 
- private static final String ORC_TIMEZONE_FILE = "orc_timezone_info.data"; + int dstSavings = (startTransitionRule.getOffsetAfter().getTotalSeconds() - + startTransitionRule.getOffsetBefore().getTotalSeconds()) * 1000; + int endDeltaMillis = (endTransitionRule.getOffsetBefore().getTotalSeconds() - + endTransitionRule.getOffsetAfter().getTotalSeconds()) * 1000; + if (dstSavings != endDeltaMillis) { + throw new IllegalStateException("Mismatched ORC DST savings for timezone: " + timezoneId); + } - // the mapped memory for the file - private static MappedByteBuffer serializedBuf = null; + DstRule rule = new DstRule(); + rule.dstSavings = dstSavings; + fillDstRuleFromTransitionRule(timezoneId, rule, startTransitionRule, true); + fillDstRuleFromTransitionRule(timezoneId, rule, endTransitionRule, false); - static { - readTimezoneInfoFromFile(); + if (!verifyDstRuleAcrossReferenceYears(tz, rule)) { + throw new IllegalStateException("ZoneRules ORC DST rule verification failed for timezone: " + + timezoneId); + } + return rule; } - private static void readTimezoneInfoFromFile() { - URL path = OrcTimezoneInfo.class.getClassLoader().getResource(ORC_TIMEZONE_FILE); - if (path == null) { - throw new RuntimeException("Can not find ORC timezone info file " + ORC_TIMEZONE_FILE); + private static void fillDstRuleFromTransitionRule(String timezoneId, DstRule rule, + ZoneOffsetTransitionRule transitionRule, boolean isStartRule) { + // We only accept rules shaped as "first <dayOfWeek> on or after <dayOfMonth>", + // i.e. ZoneRules' positive-day-indicator form. A negative indicator would mean + // "last <dayOfWeek> on or before day" (DOW_LE_DOM_MODE); we reject those here so that + // downstream code can assume DOW_GE_DOM_MODE unconditionally. 
+ if (transitionRule.getDayOfWeek() == null || + transitionRule.getDayOfMonthIndicator() <= 0) { + throw new IllegalStateException("Unsupported ORC DST transition rule shape for timezone: " + + timezoneId); } - try (RandomAccessFile file = new RandomAccessFile(path.getPath(), "r"); - FileChannel fileChannel = file.getChannel()) { + int month = transitionRule.getMonth().getValue() - 1; + int day = transitionRule.getDayOfMonthIndicator(); + int dayOfWeek = toCalendarDayOfWeek(transitionRule.getDayOfWeek().getValue()); + int time = getTransitionRuleTimeMillis(transitionRule); + int timeMode = getTransitionRuleTimeMode(transitionRule); + // Guaranteed by the precondition above. + int mode = DstRuleMode.DOW_GE_DOM_MODE.value; - if (fileChannel.size() > 2 * 1024 * 1024) { // > 2M - throw new RuntimeException("Failed to load ORC timezone info, file is too large > 2M."); - } + if (isStartRule) { + rule.startMonth = month; + rule.startDay = day; + rule.startDayOfWeek = dayOfWeek; + rule.startTime = time; + rule.startTimeMode = timeMode; + rule.startMode = mode; + } else { + rule.endMonth = month; + rule.endDay = day; + rule.endDayOfWeek = dayOfWeek; + rule.endTime = time; + rule.endTimeMode = timeMode; + rule.endMode = mode; + } + } + + private static int getTransitionRuleTimeMillis( + ZoneOffsetTransitionRule transitionRule) { + int secondOfDay = transitionRule.isMidnightEndOfDay() ? 
+ 24 * 3600 : + transitionRule.getLocalTime().toSecondOfDay(); + return secondOfDay * 1000; + } - // Map the file into memory - serializedBuf = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0, fileChannel.size()); - } catch (IOException e) { - throw new RuntimeException("Failed to load ORC timezone info file " + ORC_TIMEZONE_FILE, e); + private static int getTransitionRuleTimeMode(ZoneOffsetTransitionRule transitionRule) { + ZoneOffsetTransitionRule.TimeDefinition timeDef = transitionRule.getTimeDefinition(); + if (ZoneOffsetTransitionRule.TimeDefinition.UTC == timeDef) { + return DstTimeMode.UTC_TIME.value; + } else if (ZoneOffsetTransitionRule.TimeDefinition.STANDARD == timeDef) { + return DstTimeMode.STANDARD_TIME.value; + } else { + return DstTimeMode.WALL_TIME.value; } } + private static int toCalendarDayOfWeek(int javaTimeDayOfWeek) { + return (javaTimeDayOfWeek % 7) + 1; + } + /** - * Get timezone info for the specified timezone Id - * @param timezoneId timezone Id - * @return timezone info + * Extract DST rule by probing getOffset() at hourly intervals in a reference year. + * This works for any TimeZone implementation (ZoneInfo, SimpleTimeZone, etc.) + * and captures the effective DST rule as the JVM sees it. + * + * We find the exact DST start and end transitions, then encode them in the + * same format that SimpleTimeZone uses internally (month, day, dayOfWeek, time, mode). 
*/ - public static OrcTimezoneInfo get(String timezoneId) { - int index = Arrays.binarySearch(timezoneIds, timezoneId); - if (index < 0) { - throw new IllegalArgumentException("Timezone ID not found: " + timezoneId); + private static DstRule extractDstRuleByProbing(TimeZone tz) { + for (int refYear : DST_RULE_VALIDATION_YEARS) { + DstRule rule = extractDstRuleByProbing(tz, refYear); + if (rule != null && verifyDstRuleAcrossReferenceYears(tz, rule)) { + return rule; + } } + return null; + } - // shallow copy - ByteBuffer buf = serializedBuf.duplicate(); - buf.order(ByteOrder.BIG_ENDIAN); - - int timezoneInfoOffsetInFile = buf.getInt(Integer.BYTES * index); - buf.position(timezoneInfoOffsetInFile); + private static DstRule extractDstRuleByProbing(TimeZone tz, int refYear) { + long janFirst = utcMillisForDate(refYear, 0, 1); + long nextJanFirst = utcMillisForDate(refYear + 1, 0, 1); - int rawOffsets = buf.getInt(); + // Find DST-on and DST-off transitions by scanning hourly + long dstOnTransition = -1; + long dstOffTransition = -1; + int prevOffset = tz.getOffset(janFirst - 1); + long step = 3600_000L; // 1 hour - int numTransitions = buf.getInt(); - long[] transitions = new long[numTransitions]; - for (int i = 0; i < numTransitions; ++i) { - transitions[i] = buf.getLong(); + for (long ms = janFirst; ms < nextJanFirst; ms += step) { + int curOffset = tz.getOffset(ms); + if (curOffset != prevOffset) { + // Found a transition; narrow down to exact millisecond with binary search + long exactMs = binarySearchTransition(tz, ms - step, ms); + if (curOffset > prevOffset) { + // More than one DST-on transition in the same year means this year + // doesn't fit a SimpleTimeZone-style two-transition rule; let the + // caller fall back to extractDstRuleFromZoneRules. 
+ if (dstOnTransition >= 0) return null; + dstOnTransition = exactMs; + } else { + if (dstOffTransition >= 0) return null; + dstOffTransition = exactMs; + } + prevOffset = curOffset; + } } - int numOffsets = buf.getInt(); - int[] offsets = new int[numOffsets]; - for (int i = 0; i < numOffsets; ++i) { - offsets[i] = buf.getInt(); + if (dstOnTransition < 0 || dstOffTransition < 0) { + return null; } - return new OrcTimezoneInfo(rawOffsets, transitions, offsets); + DstRule rule = new DstRule(); + rule.dstSavings = tz.getDSTSavings(); + + // decodeTransition converts to standard local time, so the rule time mode is STANDARD_TIME. + int[] startFields = decodeTransition(dstOnTransition, tz.getRawOffset()); + rule.startMonth = startFields[0]; + rule.startDay = startFields[1]; + rule.startDayOfWeek = startFields[2]; + rule.startTime = startFields[3]; + rule.startTimeMode = DstTimeMode.STANDARD_TIME.value; + rule.startMode = startFields[4]; + + int[] endFields = decodeTransition(dstOffTransition, tz.getRawOffset()); + rule.endMonth = endFields[0]; + rule.endDay = endFields[1]; + rule.endDayOfWeek = endFields[2]; + rule.endTime = endFields[3]; + rule.endTimeMode = DstTimeMode.STANDARD_TIME.value; + rule.endMode = endFields[4]; + + return rule; } - public static List getAllTimezoneIds() { - return Arrays.asList(timezoneIds); + private static boolean verifyDstRuleAcrossReferenceYears(TimeZone tz, DstRule rule) { + for (int refYear : DST_RULE_VALIDATION_YEARS) { + if (!verifyDstRule(tz, rule, refYear)) { + return false; + } + } + return true; } - private static final String[] timezoneIds = {"ACT", "AET", "AGT", "ART", "AST", "Africa/Abidjan", "Africa/Accra", "Africa/Addis_Ababa", "Africa/Algiers", "Africa/Asmara", "Africa/Asmera", "Africa/Bamako", "Africa/Bangui", "Africa/Banjul", "Africa/Bissau", "Africa/Blantyre", "Africa/Brazzaville", "Africa/Bujumbura", "Africa/Cairo", "Africa/Casablanca", "Africa/Ceuta", "Africa/Conakry", "Africa/Dakar", "Africa/Dar_es_Salaam", 
"Africa/Djibouti", "Africa/Douala", "Africa/El_Aaiun", "Africa/Freetown", "Africa/Gaborone", "Africa/Harare", "Africa/Johannesburg", "Africa/Juba", "Africa/Kampala", "Africa/Khartoum", "Africa/Kigali", "Africa/Kinshasa", "Africa/Lagos", "Africa/Libreville", "Africa/Lome", "Africa/Luanda", "Africa/Lubumbashi", "Africa/Lusaka", "Africa/Malabo", "Africa/Maputo", "Africa/Maseru", "Africa/Mbabane", "Africa/Mogadishu", "Africa/Monrovia", "Africa/Nairobi", "Africa/Ndjamena", "Africa/Niamey", "Africa/Nouakchott", "Africa/Ouagadougou", "Africa/Porto-Novo", "Africa/Sao_Tome", "Africa/Timbuktu", "Africa/Tripoli", "Africa/Tunis", "Africa/Windhoek", "America/Adak", "America/Anchorage", "America/Anguilla", "America/Antigua", "America/Araguaina", "America/Argentina/Buenos_Aires", "America/Argentina/Catamarca", "America/Argentina/ComodRivadavia", "America/Argentina/Cordoba", "America/Argentina/Jujuy", "America/Argentina/La_Rioja", "America/Argentina/Mendoza", "America/Argentina/Rio_Gallegos", "America/Argentina/Salta", "America/Argentina/San_Juan", "America/Argentina/San_Luis", "America/Argentina/Tucuman", "America/Argentina/Ushuaia", "America/Aruba", "America/Asuncion", "America/Atikokan", "America/Atka", "America/Bahia", "America/Bahia_Banderas", "America/Barbados", "America/Belem", "America/Belize", "America/Blanc-Sablon", "America/Boa_Vista", "America/Bogota", "America/Boise", "America/Buenos_Aires", "America/Cambridge_Bay", "America/Campo_Grande", "America/Cancun", "America/Caracas", "America/Catamarca", "America/Cayenne", "America/Cayman", "America/Chicago", "America/Chihuahua", "America/Ciudad_Juarez", "America/Coral_Harbour", "America/Cordoba", "America/Costa_Rica", "America/Coyhaique", "America/Creston", "America/Cuiaba", "America/Curacao", "America/Danmarkshavn", "America/Dawson", "America/Dawson_Creek", "America/Denver", "America/Detroit", "America/Dominica", "America/Edmonton", "America/Eirunepe", "America/El_Salvador", "America/Ensenada", "America/Fort_Nelson", 
"America/Fort_Wayne", "America/Fortaleza", "America/Glace_Bay", "America/Godthab", "America/Goose_Bay", "America/Grand_Turk", "America/Grenada", "America/Guadeloupe", "America/Guatemala", "America/Guayaquil", "America/Guyana", "America/Halifax", "America/Havana", "America/Hermosillo", "America/Indiana/Indianapolis", "America/Indiana/Knox", "America/Indiana/Marengo", "America/Indiana/Petersburg", "America/Indiana/Tell_City", "America/Indiana/Vevay", "America/Indiana/Vincennes", "America/Indiana/Winamac", "America/Indianapolis", "America/Inuvik", "America/Iqaluit", "America/Jamaica", "America/Jujuy", "America/Juneau", "America/Kentucky/Louisville", "America/Kentucky/Monticello", "America/Knox_IN", "America/Kralendijk", "America/La_Paz", "America/Lima", "America/Los_Angeles", "America/Louisville", "America/Lower_Princes", "America/Maceio", "America/Managua", "America/Manaus", "America/Marigot", "America/Martinique", "America/Matamoros", "America/Mazatlan", "America/Mendoza", "America/Menominee", "America/Merida", "America/Metlakatla", "America/Mexico_City", "America/Miquelon", "America/Moncton", "America/Monterrey", "America/Montevideo", "America/Montreal", "America/Montserrat", "America/Nassau", "America/New_York", "America/Nipigon", "America/Nome", "America/Noronha", "America/North_Dakota/Beulah", "America/North_Dakota/Center", "America/North_Dakota/New_Salem", "America/Nuuk", "America/Ojinaga", "America/Panama", "America/Pangnirtung", "America/Paramaribo", "America/Phoenix", "America/Port-au-Prince", "America/Port_of_Spain", "America/Porto_Acre", "America/Porto_Velho", "America/Puerto_Rico", "America/Punta_Arenas", "America/Rainy_River", "America/Rankin_Inlet", "America/Recife", "America/Regina", "America/Resolute", "America/Rio_Branco", "America/Rosario", "America/Santa_Isabel", "America/Santarem", "America/Santiago", "America/Santo_Domingo", "America/Sao_Paulo", "America/Scoresbysund", "America/Shiprock", "America/Sitka", "America/St_Barthelemy", 
"America/St_Johns", "America/St_Kitts", "America/St_Lucia", "America/St_Thomas", "America/St_Vincent", "America/Swift_Current", "America/Tegucigalpa", "America/Thule", "America/Thunder_Bay", "America/Tijuana", "America/Toronto", "America/Tortola", "America/Vancouver", "America/Virgin", "America/Whitehorse", "America/Winnipeg", "America/Yakutat", "America/Yellowknife", "Antarctica/Casey", "Antarctica/Davis", "Antarctica/DumontDUrville", "Antarctica/Macquarie", "Antarctica/Mawson", "Antarctica/McMurdo", "Antarctica/Palmer", "Antarctica/Rothera", "Antarctica/South_Pole", "Antarctica/Syowa", "Antarctica/Troll", "Antarctica/Vostok", "Arctic/Longyearbyen", "Asia/Aden", "Asia/Almaty", "Asia/Amman", "Asia/Anadyr", "Asia/Aqtau", "Asia/Aqtobe", "Asia/Ashgabat", "Asia/Ashkhabad", "Asia/Atyrau", "Asia/Baghdad", "Asia/Bahrain", "Asia/Baku", "Asia/Bangkok", "Asia/Barnaul", "Asia/Beirut", "Asia/Bishkek", "Asia/Brunei", "Asia/Calcutta", "Asia/Chita", "Asia/Choibalsan", "Asia/Chongqing", "Asia/Chungking", "Asia/Colombo", "Asia/Dacca", "Asia/Damascus", "Asia/Dhaka", "Asia/Dili", "Asia/Dubai", "Asia/Dushanbe", "Asia/Famagusta", "Asia/Gaza", "Asia/Harbin", "Asia/Hebron", "Asia/Ho_Chi_Minh", "Asia/Hong_Kong", "Asia/Hovd", "Asia/Irkutsk", "Asia/Istanbul", "Asia/Jakarta", "Asia/Jayapura", "Asia/Jerusalem", "Asia/Kabul", "Asia/Kamchatka", "Asia/Karachi", "Asia/Kashgar", "Asia/Kathmandu", "Asia/Katmandu", "Asia/Khandyga", "Asia/Kolkata", "Asia/Krasnoyarsk", "Asia/Kuala_Lumpur", "Asia/Kuching", "Asia/Kuwait", "Asia/Macao", "Asia/Macau", "Asia/Magadan", "Asia/Makassar", "Asia/Manila", "Asia/Muscat", "Asia/Nicosia", "Asia/Novokuznetsk", "Asia/Novosibirsk", "Asia/Omsk", "Asia/Oral", "Asia/Phnom_Penh", "Asia/Pontianak", "Asia/Pyongyang", "Asia/Qatar", "Asia/Qostanay", "Asia/Qyzylorda", "Asia/Rangoon", "Asia/Riyadh", "Asia/Saigon", "Asia/Sakhalin", "Asia/Samarkand", "Asia/Seoul", "Asia/Shanghai", "Asia/Singapore", "Asia/Srednekolymsk", "Asia/Taipei", "Asia/Tashkent", "Asia/Tbilisi", 
"Asia/Tehran", "Asia/Tel_Aviv", "Asia/Thimbu", "Asia/Thimphu", "Asia/Tokyo", "Asia/Tomsk", "Asia/Ujung_Pandang", "Asia/Ulaanbaatar", "Asia/Ulan_Bator", "Asia/Urumqi", "Asia/Ust-Nera", "Asia/Vientiane", "Asia/Vladivostok", "Asia/Yakutsk", "Asia/Yangon", "Asia/Yekaterinburg", "Asia/Yerevan", "Atlantic/Azores", "Atlantic/Bermuda", "Atlantic/Canary", "Atlantic/Cape_Verde", "Atlantic/Faeroe", "Atlantic/Faroe", "Atlantic/Jan_Mayen", "Atlantic/Madeira", "Atlantic/Reykjavik", "Atlantic/South_Georgia", "Atlantic/St_Helena", "Atlantic/Stanley", "Australia/ACT", "Australia/Adelaide", "Australia/Brisbane", "Australia/Broken_Hill", "Australia/Canberra", "Australia/Currie", "Australia/Darwin", "Australia/Eucla", "Australia/Hobart", "Australia/LHI", "Australia/Lindeman", "Australia/Lord_Howe", "Australia/Melbourne", "Australia/NSW", "Australia/North", "Australia/Perth", "Australia/Queensland", "Australia/South", "Australia/Sydney", "Australia/Tasmania", "Australia/Victoria", "Australia/West", "Australia/Yancowinna", "BET", "BST", "Brazil/Acre", "Brazil/DeNoronha", "Brazil/East", "Brazil/West", "CAT", "CET", "CNT", "CST", "CST6CDT", "CTT", "Canada/Atlantic", "Canada/Central", "Canada/Eastern", "Canada/Mountain", "Canada/Newfoundland", "Canada/Pacific", "Canada/Saskatchewan", "Canada/Yukon", "Chile/Continental", "Chile/EasterIsland", "Cuba", "EAT", "ECT", "EET", "EST", "EST5EDT", "Egypt", "Eire", "Etc/GMT", "Etc/GMT+0", "Etc/GMT+1", "Etc/GMT+10", "Etc/GMT+11", "Etc/GMT+12", "Etc/GMT+2", "Etc/GMT+3", "Etc/GMT+4", "Etc/GMT+5", "Etc/GMT+6", "Etc/GMT+7", "Etc/GMT+8", "Etc/GMT+9", "Etc/GMT-0", "Etc/GMT-1", "Etc/GMT-10", "Etc/GMT-11", "Etc/GMT-12", "Etc/GMT-13", "Etc/GMT-14", "Etc/GMT-2", "Etc/GMT-3", "Etc/GMT-4", "Etc/GMT-5", "Etc/GMT-6", "Etc/GMT-7", "Etc/GMT-8", "Etc/GMT-9", "Etc/GMT0", "Etc/Greenwich", "Etc/UCT", "Etc/UTC", "Etc/Universal", "Etc/Zulu", "Europe/Amsterdam", "Europe/Andorra", "Europe/Astrakhan", "Europe/Athens", "Europe/Belfast", "Europe/Belgrade", "Europe/Berlin", 
"Europe/Bratislava", "Europe/Brussels", "Europe/Bucharest", "Europe/Budapest", "Europe/Busingen", "Europe/Chisinau", "Europe/Copenhagen", "Europe/Dublin", "Europe/Gibraltar", "Europe/Guernsey", "Europe/Helsinki", "Europe/Isle_of_Man", "Europe/Istanbul", "Europe/Jersey", "Europe/Kaliningrad", "Europe/Kiev", "Europe/Kirov", "Europe/Kyiv", "Europe/Lisbon", "Europe/Ljubljana", "Europe/London", "Europe/Luxembourg", "Europe/Madrid", "Europe/Malta", "Europe/Mariehamn", "Europe/Minsk", "Europe/Monaco", "Europe/Moscow", "Europe/Nicosia", "Europe/Oslo", "Europe/Paris", "Europe/Podgorica", "Europe/Prague", "Europe/Riga", "Europe/Rome", "Europe/Samara", "Europe/San_Marino", "Europe/Sarajevo", "Europe/Saratov", "Europe/Simferopol", "Europe/Skopje", "Europe/Sofia", "Europe/Stockholm", "Europe/Tallinn", "Europe/Tirane", "Europe/Tiraspol", "Europe/Ulyanovsk", "Europe/Uzhgorod", "Europe/Vaduz", "Europe/Vatican", "Europe/Vienna", "Europe/Vilnius", "Europe/Volgograd", "Europe/Warsaw", "Europe/Zagreb", "Europe/Zaporozhye", "Europe/Zurich", "GB", "GB-Eire", "GMT", "GMT0", "Greenwich", "HST", "Hongkong", "IET", "IST", "Iceland", "Indian/Antananarivo", "Indian/Chagos", "Indian/Christmas", "Indian/Cocos", "Indian/Comoro", "Indian/Kerguelen", "Indian/Mahe", "Indian/Maldives", "Indian/Mauritius", "Indian/Mayotte", "Indian/Reunion", "Iran", "Israel", "JST", "Jamaica", "Japan", "Kwajalein", "Libya", "MET", "MIT", "MST", "MST7MDT", "Mexico/BajaNorte", "Mexico/BajaSur", "Mexico/General", "NET", "NST", "NZ", "NZ-CHAT", "Navajo", "PLT", "PNT", "PRC", "PRT", "PST", "PST8PDT", "Pacific/Apia", "Pacific/Auckland", "Pacific/Bougainville", "Pacific/Chatham", "Pacific/Chuuk", "Pacific/Easter", "Pacific/Efate", "Pacific/Enderbury", "Pacific/Fakaofo", "Pacific/Fiji", "Pacific/Funafuti", "Pacific/Galapagos", "Pacific/Gambier", "Pacific/Guadalcanal", "Pacific/Guam", "Pacific/Honolulu", "Pacific/Johnston", "Pacific/Kanton", "Pacific/Kiritimati", "Pacific/Kosrae", "Pacific/Kwajalein", "Pacific/Majuro", 
"Pacific/Marquesas", "Pacific/Midway", "Pacific/Nauru", "Pacific/Niue", "Pacific/Norfolk", "Pacific/Noumea", "Pacific/Pago_Pago", "Pacific/Palau", "Pacific/Pitcairn", "Pacific/Pohnpei", "Pacific/Ponape", "Pacific/Port_Moresby", "Pacific/Rarotonga", "Pacific/Saipan", "Pacific/Samoa", "Pacific/Tahiti", "Pacific/Tarawa", "Pacific/Tongatapu", "Pacific/Truk", "Pacific/Wake", "Pacific/Wallis", "Pacific/Yap", "Poland", "Portugal", "ROK", "SST", "Singapore", "SystemV/AST4", "SystemV/AST4ADT", "SystemV/CST6", "SystemV/CST6CDT", "SystemV/EST5", "SystemV/EST5EDT", "SystemV/HST10", "SystemV/MST7", "SystemV/MST7MDT", "SystemV/PST8", "SystemV/PST8PDT", "SystemV/YST9", "SystemV/YST9YDT", "Turkey", "UCT", "US/Alaska", "US/Aleutian", "US/Arizona", "US/Central", "US/East-Indiana", "US/Eastern", "US/Hawaii", "US/Indiana-Starke", "US/Michigan", "US/Mountain", "US/Pacific", "US/Samoa", "UTC", "Universal", "VST", "W-SU", "WET", "Zulu"}; + private static long binarySearchTransition(TimeZone tz, long lo, long hi) { + int loOffset = tz.getOffset(lo); + while (hi - lo > 1) { + long mid = lo + (hi - lo) / 2; + if (tz.getOffset(mid) == loOffset) { + lo = mid; + } else { + hi = mid; + } + } + return hi; + } /** - * This method is only used to generate the timezone info file for maintenance purpose. + * Decode a UTC transition instant into (month, day, dayOfWeek, timeInDay, mode). + * Returns [month(0-11), day, dayOfWeek(1-7), timeMs, {@link DstRuleMode#value}]. * - * The generated file is based on OpenJDK 8's `sun.util.calendar.ZoneInfo` implementation. - * Since `ZoneInfo` is not public API, on some JDK distributions (like Oracle JDK), - * it's not accessible. So we comment the method out to avoid build issues. + * We encode recurring weekday rules as {@link DstRuleMode#DOW_GE_DOM_MODE}. + * For nth-weekday rules, the base day is the earliest possible day of that + * occurrence in the month: + * 1st => 1, 2nd => 8, 3rd => 15, 4th => 22. 
+ * For last-weekday rules, the base day is the earliest day of the final week + * in the month, i.e. {@code monthLength - 6}. * - * File format: - * - First N * 4 bytes: N is number of timezone Ids - * - each 4 bytes is the offset of the timezone info in the file - * - Then each timezone info: - * - 4 bytes: rawOffset (int) - * - 4 bytes: numTransitions (int) - * - numTransitions * 8 bytes: transitions (long[]) - * - 4 bytes: numOffsets (int) - * - numOffsets * 4 bytes: offsets (int[]) - * - * How to do the maintenance: - * - update the `timezoneIds` via TimeZone.getAvailableIDs() and sort them. - * - run this method to generate the timezone info file, and copy the file to resources folder. + * This mirrors encodings such as "Sun >= 8" for the second Sunday in March + * and "Sun >= 25" for the last Sunday in October. */ - public static void serializeTimezoneInfo() { -// try { -// String path = "/tmp/orc_timezone_info.data"; -// -// // sort timezone ids -// String[] ids = TimeZone.getAvailableIDs(); -// ArrayList sortedIds = new ArrayList<>(Arrays.asList(ids)); -// sortedIds.sort(String::compareTo); -// -// List timezoneOffsets = new ArrayList<>(); -// DataOutputStream out = new DataOutputStream(Files.newOutputStream(Paths.get(path))); -// -// // from ZoneInfo source code -// long OFFSET_MASK_IN_ZONE_INFO = 0x0FL; -// int TRANSITION_NSHIFT_IN_ZONE_INFO = 12; -// -// // collect offsets for each timezone -// int timezoneOffsetInFile = 0; -// for (String id : sortedIds) { -// timezoneOffsets.add(timezoneOffsetInFile); -// -// ZoneInfo zoneInfo = (ZoneInfo) TimeZone.getTimeZone(id); -// long[] trans = (long[]) FieldUtils.readField(zoneInfo, "transitions"); -// int numTransitions = trans == null ? 
0 : trans.length; -// -// // timezone serialized size calculation -// timezoneOffsetInFile += 4; // rawOffset -// timezoneOffsetInFile += 4; // numTransitions -// timezoneOffsetInFile += numTransitions * 8; // transitions longs -// timezoneOffsetInFile += 4; // numOffsets -// timezoneOffsetInFile += numTransitions * 4; // offsets ints -// } -// -// // First write all timezone offsets in the file -// int totalOffsetIndicesSize = sortedIds.size() * 4; -// for (int off : timezoneOffsets) { -// out.writeInt(off + totalOffsetIndicesSize); -// } -// -// // Then write each timezone info -// for (String id : sortedIds) { -// ZoneInfo zoneInfo = (ZoneInfo) TimeZone.getTimeZone(id); -// long[] trans = (long[]) FieldUtils.readField(zoneInfo, "transitions"); -// int[] offs = (int[]) FieldUtils.readField(zoneInfo, "offsets"); -// int rawOff = (int) FieldUtils.readField(zoneInfo, "rawOffset"); -// -// int numTransitions = trans == null ? 0 : trans.length; -// -// long[] actualTrans = new long[numTransitions]; -// int[] actualOffsets = new int[numTransitions]; -// for (int i = 0; i < numTransitions; ++i) { -// // `trans` is combination of transition and offset index -// actualTrans[i] = trans[i] >> TRANSITION_NSHIFT_IN_ZONE_INFO; -// // the `offs` is a dictionary, get the actual offset value via index -// // `trans[i] & OFFSET_MASK_IN_ZONE_INFO` is to get offset index -// actualOffsets[i] = offs[(int) (trans[i] & OFFSET_MASK_IN_ZONE_INFO)]; -// } -// -// out.writeInt(rawOff); -// -// out.writeInt(numTransitions); -// for (long t : actualTrans) { -// out.writeLong(t); -// } -// -// out.writeInt(numTransitions); -// for (int o : actualOffsets) { -// out.writeInt(o); -// } -// } -// out.flush(); -// out.close(); -// } catch (Exception e) { -// throw new RuntimeException("Failed to serialize ORC timezone info.", e); -// } + private static int[] decodeTransition(long utcMs, int rawOffsetMs) { + // Convert UTC ms to standard local time + long localMs = utcMs + rawOffsetMs; + 
java.time.Instant instant = java.time.Instant.ofEpochMilli(localMs); + java.time.LocalDateTime ldt = java.time.LocalDateTime.ofInstant( + instant, java.time.ZoneOffset.UTC); + + int month = ldt.getMonthValue() - 1; // 0-based for Calendar compat + int dayOfMonth = ldt.getDayOfMonth(); + // Calendar: 1=Sun..7=Sat + int dayOfWeek = toCalendarDayOfWeek(ldt.getDayOfWeek().getValue()); + int timeInDay = ldt.getHour() * 3600_000 + ldt.getMinute() * 60_000 + + ldt.getSecond() * 1000 + ldt.getNano() / 1_000_000; + + int monthLength = ldt.toLocalDate().lengthOfMonth(); + int dayOfWeekInMonth = (dayOfMonth - 1) / 7 + 1; + boolean isLastOccurrence = dayOfMonth + 7 > monthLength; + int baseDayOfMonth = isLastOccurrence ? + monthLength - 6 : + 1 + (dayOfWeekInMonth - 1) * 7; + + // DOW_GE_DOM: first <dayOfWeek> on or after <baseDayOfMonth> + return new int[]{month, baseDayOfMonth, dayOfWeek, timeInDay, DstRuleMode.DOW_GE_DOM_MODE.value}; } + }