From 4ce9ad893861b1a9e49e3c8567aa4468c17465fc Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 9 Jan 2025 17:08:56 -0500
Subject: [PATCH v2 1/3] Seek zone abbreviations in the IANA data before
 timezone_abbreviations.

If a time zone abbreviation used in datetime input is defined in
the currently active timezone, use that definition in preference
to looking in the timezone_abbreviations list.  That allows us to
correctly handle abbreviations that have different meanings in
different timezones.  Also, it eliminates an inconsistency between
datetime input and datetime output: the non-ISO datestyles for
timestamptz have always printed abbreviations taken from the IANA
data not from timezone_abbreviations.  Before this fix, it was
possible to demonstrate cases where casting a timestamp to text
and back fails or changes the value significantly because of that
inconsistency.

While this change removes the ability to override the IANA data about
an abbreviation known in the current zone, it's not clear that there's
any real use-case for doing so.  But it is clear that this makes life
a lot easier for dealing with abbreviations that have conflicts.

There are a couple of loose ends still to deal with:

* As this patch stands, it causes a noticeable degradation of the
runtime of timestamptz_in (about 20% in a microbenchmark of just
that function).  This is from DecodeTimezoneAbbrev not caching
the results of its lookup in the new path.  I split out the
improvement of that part for a follow-up patch.

* The pg_timezone_abbrevs view shows only abbreviations from
the timezone_abbreviations list.  That should probably be
adjusted to account for abbreviations taken from the timezone.

Per report from Aleksander Alekseev and additional investigation.

Discussion: https://postgr.es/m/CAJ7c6TOATjJqvhnYsui0=CO5XFMF4dvTGH+skzB--jNhqSQu5g@mail.gmail.com
---
 doc/src/sgml/config.sgml                  |  6 +-
 doc/src/sgml/datatype.sgml                |  4 +
 doc/src/sgml/datetime.sgml                | 42 ++++++++++-
 src/backend/utils/adt/datetime.c          | 89 +++++++++++++++++++++--
 src/include/pgtime.h                      |  5 ++
 src/test/regress/expected/horology.out    |  6 ++
 src/test/regress/expected/timestamptz.out | 59 +++++++++++++++
 src/test/regress/sql/horology.sql         |  1 +
 src/test/regress/sql/timestamptz.sql      | 17 +++++
 src/timezone/localtime.c                  | 76 +++++++++++++++++++
 10 files changed, 294 insertions(+), 11 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index f1ab614575..453d213966 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -10022,8 +10022,10 @@ SET XML OPTION { DOCUMENT | CONTENT };
       </term>
       <listitem>
        <para>
-        Sets the collection of time zone abbreviations that will be accepted
-        by the server for datetime input.  The default is <literal>'Default'</literal>,
+        Sets the collection of additional time zone abbreviations that
+        will be accepted by the server for datetime input (beyond any
+        abbreviations defined by the current <varname>TimeZone</varname>
+        setting).  The default is <literal>'Default'</literal>,
         which is a collection that works in most of the world; there are
         also <literal>'Australia'</literal> and <literal>'India'</literal>,
         and other collections can be defined for a particular installation.
diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml
index 3e6751d64c..1d9127e94e 100644
--- a/doc/src/sgml/datatype.sgml
+++ b/doc/src/sgml/datatype.sgml
@@ -2534,6 +2534,10 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02'
      abbreviation if one is in common use in the current zone.  Otherwise
      it appears as a signed numeric offset in ISO 8601 basic format
      (<replaceable>hh</replaceable> or <replaceable>hhmm</replaceable>).
+     The alphabetic abbreviations shown in these styles are taken from the
+     IANA time zone database entry currently selected by the
+     <xref linkend="guc-timezone"/> run-time parameter; they are not
+     affected by the <xref linkend="guc-timezone-abbreviations"/> setting.
     </para>
 
     <para>
diff --git a/doc/src/sgml/datetime.sgml b/doc/src/sgml/datetime.sgml
index e7035c7806..3e24170acb 100644
--- a/doc/src/sgml/datetime.sgml
+++ b/doc/src/sgml/datetime.sgml
@@ -80,7 +80,7 @@
       <step>
        <para>
         See if the token matches any known time zone abbreviation.
-        These abbreviations are supplied by the configuration file
+        These abbreviations are determined by the configuration settings
         described in <xref linkend="datetime-config-files"/>.
        </para>
       </step>
@@ -424,9 +424,43 @@
    <para>
     Since timezone abbreviations are not well standardized,
     <productname>PostgreSQL</productname> provides a means to customize
-    the set of abbreviations accepted by the server.  The
-    <xref linkend="guc-timezone-abbreviations"/> run-time parameter
-    determines the active set of abbreviations.  While this parameter
+    the set of abbreviations accepted in datetime input.
+    There are two sources for these abbreviations:
+
+    <orderedlist>
+     <listitem>
+      <para>
+       The <xref linkend="guc-timezone"/> run-time parameter is usually
+       set to the name of an entry in the IANA time zone database.
+       If that zone has widely used zone abbreviations, they will appear
+       in the IANA data, and <productname>PostgreSQL</productname> will
+       preferentially recognize those abbreviations with the meanings
+       given in the IANA data.
+       For example, if <varname>timezone</varname> is set
+       to <literal>America/New_York</literal> then <literal>EST</literal>
+       will be understood as UTC-5 and <literal>EDT</literal> will be
+       understood as UTC-4.  (These IANA abbreviations will also be used
+       in datetime output, if <xref linkend="guc-datestyle"/> is set to a
+       style that prefers non-numeric zone abbreviations.)
+      </para>
+     </listitem>
+
+     <listitem>
+      <para>
+       If an abbreviation is not found in the current IANA time zone,
+       it is sought in the list specified by the
+       <xref linkend="guc-timezone-abbreviations"/> run-time parameter.
+       The <varname>timezone_abbreviations</varname> list is primarily
+       useful for allowing datetime input to recognize abbreviations for
+       time zones other than the current zone.  (These abbreviations will
+       not be used in datetime output.)
+      </para>
+     </listitem>
+    </orderedlist>
+   </para>
+
+   <para>
+    While the <varname>timezone_abbreviations</varname> parameter
     can be altered by any database user, the possible values for it
     are under the control of the database administrator &mdash; they
     are in fact names of configuration files stored in
diff --git a/src/backend/utils/adt/datetime.c b/src/backend/utils/adt/datetime.c
index d8af3591d1..cb028d3934 100644
--- a/src/backend/utils/adt/datetime.c
+++ b/src/backend/utils/adt/datetime.c
@@ -1845,6 +1845,40 @@ DetermineTimeZoneAbbrevOffsetInternal(pg_time_t t, const char *abbr, pg_tz *tzp,
 }
 
 
+/* TimeZoneAbbrevIsKnown()
+ *
+ * Detect whether the given string is a time zone abbreviation that's known
+ * in the specified TZDB timezone, and if so whether it has a fixed or varying
+ * meaning.  The match is not case-sensitive.  Outputs are set only if known.
+ */
+static bool
+TimeZoneAbbrevIsKnown(const char *abbr, pg_tz *tzp,
+					  bool *isfixed, int *offset, int *isdst)
+{
+	char		upabbr[TZ_STRLEN_MAX + 1];
+	unsigned char *p;
+	long int	gmtoff;
+
+	/* Upcase the abbrev: pg_timezone_abbrev_is_known() is case-sensitive */
+	strlcpy(upabbr, abbr, sizeof(upabbr));
+	for (p = (unsigned char *) upabbr; *p; p++)
+		*p = pg_toupper(*p);
+
+	/* Look up the abbrev's meaning in this zone */
+	if (pg_timezone_abbrev_is_known(upabbr,
+									isfixed,
+									&gmtoff,
+									isdst,
+									tzp))
+	{
+		/* Change sign to agree with DetermineTimeZoneOffset() */
+		*offset = (int) -gmtoff;
+		return true;
+	}
+	return false;
+}
+
+
 /* DecodeTimeOnly()
  * Interpret parsed string as time fields only.
  * Returns 0 if successful, DTERR code if bogus input detected.
@@ -3092,8 +3126,28 @@ DecodeTimezoneAbbrev(int field, const char *lowtoken,
 					 int *ftype, int *offset, pg_tz **tz,
 					 DateTimeErrorExtra *extra)
 {
+	bool		isfixed;
+	int			isdst;
 	const datetkn *tp;
 
+	/*
+	 * See if the current session_timezone recognizes it.  Checking this
+	 * before zoneabbrevtbl allows us to correctly handle abbreviations whose
+	 * meaning varies across zones, such as "LMT".  (Caching this lookup is
+	 * left for later.)
+	 */
+	if (session_timezone &&
+		TimeZoneAbbrevIsKnown(lowtoken, session_timezone,
+							  &isfixed, offset, &isdst))
+	{
+		*ftype = (isfixed ? (isdst ? DTZ : TZ) : DYNTZ);
+		*tz = (isfixed ? NULL : session_timezone);
+		/* flip sign to agree with the convention used in zoneabbrevtbl */
+		*offset = -(*offset);
+		return 0;
+	}
+
+	/* Nope, so look in zoneabbrevtbl */
 	tp = abbrevcache[field];
 	/* use strncmp so that we match truncated tokens */
 	if (tp == NULL || strncmp(lowtoken, tp->token, TOKMAXLEN) != 0)
@@ -3109,6 +3163,7 @@ DecodeTimezoneAbbrev(int field, const char *lowtoken,
 		*ftype = UNKNOWN_FIELD;
 		*offset = 0;
 		*tz = NULL;
+		/* failure results are not cached */
 	}
 	else
 	{
@@ -3278,9 +3333,6 @@ DecodeTimezoneAbbrevPrefix(const char *str, int *offset, pg_tz **tz)
 	*offset = 0;				/* avoid uninitialized vars on failure */
 	*tz = NULL;
 
-	if (!zoneabbrevtbl)
-		return -1;				/* no abbrevs known, so fail immediately */
-
 	/* Downcase as much of the string as we could need */
 	for (len = 0; len < TOKMAXLEN; len++)
 	{
@@ -3299,9 +3351,34 @@ DecodeTimezoneAbbrevPrefix(const char *str, int *offset, pg_tz **tz)
 	 */
 	while (len > 0)
 	{
-		const datetkn *tp = datebsearch(lowtoken, zoneabbrevtbl->abbrevs,
-										zoneabbrevtbl->numabbrevs);
+		bool		isfixed;
+		int			isdst;
+		const datetkn *tp;
+
+		/* See if the current session_timezone recognizes it. */
+		if (session_timezone &&
+			TimeZoneAbbrevIsKnown(lowtoken, session_timezone,
+								  &isfixed, offset, &isdst))
+		{
+			if (isfixed)
+			{
+				/* flip sign to agree with the convention in zoneabbrevtbl */
+				*offset = -(*offset);
+			}
+			else
+			{
+				/* Caller must resolve the abbrev's current meaning */
+				*tz = session_timezone;
+			}
+			return len;
+		}
 
+		/* Known in zoneabbrevtbl? */
+		if (zoneabbrevtbl)
+			tp = datebsearch(lowtoken, zoneabbrevtbl->abbrevs,
+							 zoneabbrevtbl->numabbrevs);
+		else
+			tp = NULL;
 		if (tp != NULL)
 		{
 			if (tp->type == DYNTZ)
@@ -3324,6 +3401,8 @@ DecodeTimezoneAbbrevPrefix(const char *str, int *offset, pg_tz **tz)
 				return len;
 			}
 		}
+
+		/* Nope, try the next shorter string. */
 		lowtoken[--len] = '\0';
 	}
 
diff --git a/src/include/pgtime.h b/src/include/pgtime.h
index 37171f1737..b8b898a69c 100644
--- a/src/include/pgtime.h
+++ b/src/include/pgtime.h
@@ -69,6 +69,11 @@ extern bool pg_interpret_timezone_abbrev(const char *abbrev,
 										 long int *gmtoff,
 										 int *isdst,
 										 const pg_tz *tz);
+extern bool pg_timezone_abbrev_is_known(const char *abbrev,
+										bool *isfixed,
+										long int *gmtoff,
+										int *isdst,
+										const pg_tz *tz);
 extern bool pg_get_timezone_offset(const pg_tz *tz, long int *gmtoff);
 extern const char *pg_get_timezone_name(pg_tz *tz);
 extern bool pg_tz_acceptable(pg_tz *tz);
diff --git a/src/test/regress/expected/horology.out b/src/test/regress/expected/horology.out
index cb28dfbaee..b90bfcd794 100644
--- a/src/test/regress/expected/horology.out
+++ b/src/test/regress/expected/horology.out
@@ -3332,6 +3332,12 @@ SELECT to_timestamp('2011-12-18 11:38 MSK', 'YYYY-MM-DD HH12:MI TZ');  -- dyntz
  Sat Dec 17 23:38:00 2011 PST
 (1 row)
 
+SELECT to_timestamp('2011-12-18 00:00 LMT', 'YYYY-MM-DD HH24:MI TZ');  -- dyntz
+         to_timestamp         
+------------------------------
+ Sat Dec 17 23:52:58 2011 PST
+(1 row)
+
 SELECT to_timestamp('2011-12-18 11:38ESTFOO24', 'YYYY-MM-DD HH12:MITZFOOSS');
          to_timestamp         
 ------------------------------
diff --git a/src/test/regress/expected/timestamptz.out b/src/test/regress/expected/timestamptz.out
index a6dd45626c..36349e363f 100644
--- a/src/test/regress/expected/timestamptz.out
+++ b/src/test/regress/expected/timestamptz.out
@@ -176,6 +176,65 @@ SELECT '205000-01-10 17:32:01 Europe/Helsinki'::timestamptz; -- non-DST
  Fri Jan 10 07:32:01 205000 PST
 (1 row)
 
+-- Recognize "LMT" as whatever it means in the current zone
+SELECT 'Jan 01 00:00:00 1000 LMT'::timestamptz;
+         timestamptz          
+------------------------------
+ Wed Jan 01 00:00:00 1000 LMT
+(1 row)
+
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;
+         timestamptz          
+------------------------------
+ Sun Dec 31 23:52:58 2023 PST
+(1 row)
+
+SET timezone = 'Europe/London';
+SELECT 'Jan 01 00:00:00 1000 LMT'::timestamptz;
+         timestamptz          
+------------------------------
+ Wed Jan 01 00:00:00 1000 LMT
+(1 row)
+
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;
+         timestamptz          
+------------------------------
+ Mon Jan 01 00:01:15 2024 GMT
+(1 row)
+
+-- which might be nothing
+SET timezone = 'UTC';
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;  -- fail
+ERROR:  invalid input syntax for type timestamp with time zone: "Jan 01 00:00:00 2024 LMT"
+LINE 1: SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;
+               ^
+-- Another example of an abbrev that varies across zones
+SELECT '1912-01-01 00:00 MMT'::timestamptz;  -- from timezone_abbreviations
+         timestamptz          
+------------------------------
+ Sun Dec 31 17:30:00 1911 UTC
+(1 row)
+
+SET timezone = 'America/Montevideo';
+SELECT '1912-01-01 00:00'::timestamptz;
+         timestamptz          
+------------------------------
+ Mon Jan 01 00:00:00 1912 MMT
+(1 row)
+
+SELECT '1912-01-01 00:00 MMT'::timestamptz;
+         timestamptz          
+------------------------------
+ Mon Jan 01 00:00:00 1912 MMT
+(1 row)
+
+SELECT '1912-01-01 00:00 MMT'::timestamptz AT TIME ZONE 'UTC';
+         timezone         
+--------------------------
+ Mon Jan 01 03:44:51 1912
+(1 row)
+
+RESET timezone;
 -- Test non-error-throwing API
 SELECT pg_input_is_valid('now', 'timestamptz');
  pg_input_is_valid 
diff --git a/src/test/regress/sql/horology.sql b/src/test/regress/sql/horology.sql
index 4aa88b4ba9..1310b43277 100644
--- a/src/test/regress/sql/horology.sql
+++ b/src/test/regress/sql/horology.sql
@@ -538,6 +538,7 @@ SELECT to_timestamp('2011-12-18 11:38 EST', 'YYYY-MM-DD HH12:MI TZ');
 SELECT to_timestamp('2011-12-18 11:38 -05', 'YYYY-MM-DD HH12:MI TZ');
 SELECT to_timestamp('2011-12-18 11:38 +01:30', 'YYYY-MM-DD HH12:MI TZ');
 SELECT to_timestamp('2011-12-18 11:38 MSK', 'YYYY-MM-DD HH12:MI TZ');  -- dyntz
+SELECT to_timestamp('2011-12-18 00:00 LMT', 'YYYY-MM-DD HH24:MI TZ');  -- dyntz
 SELECT to_timestamp('2011-12-18 11:38ESTFOO24', 'YYYY-MM-DD HH12:MITZFOOSS');
 SELECT to_timestamp('2011-12-18 11:38-05FOO24', 'YYYY-MM-DD HH12:MITZFOOSS');
 SELECT to_timestamp('2011-12-18 11:38 JUNK', 'YYYY-MM-DD HH12:MI TZ');  -- error
diff --git a/src/test/regress/sql/timestamptz.sql b/src/test/regress/sql/timestamptz.sql
index a92586c363..2fa5378a57 100644
--- a/src/test/regress/sql/timestamptz.sql
+++ b/src/test/regress/sql/timestamptz.sql
@@ -109,6 +109,23 @@ SELECT '20500110 173201 Europe/Helsinki'::timestamptz; -- non-DST
 SELECT '205000-07-10 17:32:01 Europe/Helsinki'::timestamptz; -- DST
 SELECT '205000-01-10 17:32:01 Europe/Helsinki'::timestamptz; -- non-DST
 
+-- Recognize "LMT" as whatever it means in the current zone
+SELECT 'Jan 01 00:00:00 1000 LMT'::timestamptz;
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;
+SET timezone = 'Europe/London';
+SELECT 'Jan 01 00:00:00 1000 LMT'::timestamptz;
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;
+-- which might be nothing
+SET timezone = 'UTC';
+SELECT 'Jan 01 00:00:00 2024 LMT'::timestamptz;  -- fail
+-- Another example of an abbrev that varies across zones
+SELECT '1912-01-01 00:00 MMT'::timestamptz;  -- from timezone_abbreviations
+SET timezone = 'America/Montevideo';
+SELECT '1912-01-01 00:00'::timestamptz;
+SELECT '1912-01-01 00:00 MMT'::timestamptz;
+SELECT '1912-01-01 00:00 MMT'::timestamptz AT TIME ZONE 'UTC';
+RESET timezone;
+
 -- Test non-error-throwing API
 SELECT pg_input_is_valid('now', 'timestamptz');
 SELECT pg_input_is_valid('garbage', 'timestamptz');
diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c
index 0bc160ea7d..65511ae8be 100644
--- a/src/timezone/localtime.c
+++ b/src/timezone/localtime.c
@@ -1843,6 +1843,82 @@ pg_interpret_timezone_abbrev(const char *abbrev,
 	return false;				/* hm, not actually used in any interval? */
 }
 
+/*
+ * Detect whether a timezone abbreviation is defined within the given zone.
+ *
+ * This is similar to pg_interpret_timezone_abbrev() but is not concerned
+ * with a specific point in time.  We want to know if the abbreviation is
+ * known at all, and if so whether it has one meaning or several.
+ *
+ * Returns true if the abbreviation is known, false if not (outputs untouched).
+ * If it is known and has a single meaning (only one value of gmtoff/isdst),
+ * sets *isfixed = true and sets *gmtoff and *isdst.  If there are multiple
+ * meanings, sets *isfixed = false; *gmtoff and *isdst hold the first found.
+ *
+ * Note: abbrev is matched case-sensitively; it should be all-upper-case.
+ */
+bool
+pg_timezone_abbrev_is_known(const char *abbrev,
+							bool *isfixed,
+							long int *gmtoff,
+							int *isdst,
+							const pg_tz *tz)
+{
+	bool		result = false;
+	const struct state *sp = &tz->state;
+	const char *abbrs;
+	int			abbrind;
+
+	/*
+	 * Locate the abbreviation in the zone's abbreviation list (NUL-terminated
+	 * strings packed back to back).  We assume there are no duplicates.
+	 */
+	abbrs = sp->chars;
+	abbrind = 0;
+	while (abbrind < sp->charcnt)
+	{
+		if (strcmp(abbrev, abbrs + abbrind) == 0)
+			break;
+		while (abbrs[abbrind] != '\0')
+			abbrind++;
+		abbrind++;				/* step over the terminating NUL */
+	}
+	if (abbrind >= sp->charcnt)
+		return false;			/* definitely not there */
+
+	/*
+	 * Scan the ttinfo array to find uses of the abbreviation.
+	 */
+	for (int i = 0; i < sp->typecnt; i++)
+	{
+		const struct ttinfo *ttisp = &sp->ttis[i];
+
+		if (ttisp->tt_desigidx == abbrind)
+		{
+			if (!result)
+			{
+				/* First usage */
+				*isfixed = true;	/* for the moment */
+				*gmtoff = ttisp->tt_utoff;
+				*isdst = ttisp->tt_isdst;
+				result = true;
+			}
+			else
+			{
+				/* Second or later usage, does it match? */
+				if (*gmtoff != ttisp->tt_utoff ||
+					*isdst != ttisp->tt_isdst)
+				{
+					*isfixed = false;
+					break;		/* no point in looking further */
+				}
+			}
+		}
+	}
+
+	return result;
+}
+
 /*
  * If the given timezone uses only one GMT offset, store that offset
  * into *gmtoff and return true, else return false.
-- 
2.43.5