From 665485e4096feb68246f0d29d19c687eaaeae442 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rnar=20Ness?= <bjornar.ness@gmail.com>
Date: Thu, 5 Mar 2020 19:59:39 +0100
Subject: [PATCH] feature: allocate message_id in mail from: phase

Add EXPERIMENTAL_MAILFROM_MSGID, which moves generation of message_id
from the DATA phase to the MAIL FROM phase. This allows errors logged
in the RCPT phase to reference the message_id the message would get.
---
 src/src/EDITME            |   3 +
 src/src/config.h.defaults |   1 +
 src/src/functions.h       |   1 +
 src/src/receive.c         | 114 +++-----------------------------------
 src/src/smtp_in.c         |   5 +-
 src/src/string.c          | 112 +++++++++++++++++++++++++++++++++++++
 6 files changed, 128 insertions(+), 108 deletions(-)

diff --git a/src/src/EDITME b/src/src/EDITME
index 8d8552346a..c088619da5 100644
--- a/src/src/EDITME
+++ b/src/src/EDITME
@@ -635,6 +635,9 @@ DISABLE_MAL_MKS=yes
 # Uncomment the following to include the fast-ramp two-phase-queue-run support
 # EXPERIMENTAL_QUEUE_RAMP=yes
 
+# Uncomment the following to generate the message_id at MAIL FROM time
+# EXPERIMENTAL_MAILFROM_MSGID=yes
+
 ###############################################################################
 #                 THESE ARE THINGS YOU MIGHT WANT TO SPECIFY                  #
 ###############################################################################
diff --git a/src/src/config.h.defaults b/src/src/config.h.defaults
index 9d77f30544..a2b55f6351 100644
--- a/src/src/config.h.defaults
+++ b/src/src/config.h.defaults
@@ -202,6 +202,7 @@ Do not put spaces between # and the 'define'.
 #define EXPERIMENTAL_DCC
 #define EXPERIMENTAL_DSN_INFO
 #define EXPERIMENTAL_LMDB
+#define EXPERIMENTAL_MAILFROM_MSGID
 #define EXPERIMENTAL_QUEUE_RAMP
 #define EXPERIMENTAL_QUEUEFILE
 #define EXPERIMENTAL_SRS
diff --git a/src/src/functions.h b/src/src/functions.h
index df4b336069..363c5223f0 100644
--- a/src/src/functions.h
+++ b/src/src/functions.h
@@ -220,6 +220,7 @@ extern uschar *event_raise(uschar *, const uschar *, uschar *);
 extern void    msg_event_raise(const uschar *, const address_item *);
 #endif
 
+extern void    generate_message_id(void);
 extern int     exim_chown_failure(int, const uschar*, uid_t, gid_t);
 extern const uschar * exim_errstr(int);
 extern void    exim_exit(int, const uschar *) NORETURN;
diff --git a/src/src/receive.c b/src/src/receive.c
index 6d20a5cdae..7cebbec3ed 100644
--- a/src/src/receive.c
+++ b/src/src/receive.c
@@ -1653,7 +1653,6 @@ int  error_rc = error_handling == ERRORS_SENDER
 	? errors_sender_rc : EXIT_FAILURE;
 int  header_size = 256;
 int  start, end, domain;
-int  id_resolution = 0;
 int  had_zero = 0;
 int  prevlines_length = 0;
 
@@ -1742,7 +1741,9 @@ next->text = store_get(header_size, TRUE);	/* tainted */
 header names list to be the normal list. Indicate there is no data file open
 yet, initialize the size and warning count, and deal with no size limit. */
 
+#ifndef EXPERIMENTAL_MAILFROM_MSGID
 message_id[0] = 0;
+#endif
 spool_data_file = NULL;
 data_fd = -1;
 spool_name = US"";
@@ -1775,18 +1776,6 @@ if (smtp_input && !smtp_batched_input && !f.dkim_disable_verify)
 if (sender_host_address) dmarc_init();	/* initialize libopendmarc */
 #endif
 
-/* Remember the time of reception. Exim uses time+pid for uniqueness of message
-ids, and fractions of a second are required. See the comments that precede the
-message id creation below. */
-
-(void)gettimeofday(&message_id_tv, NULL);
-
-/* For other uses of the received time we can operate with granularity of one
-second, and for that we use the global variable received_time. This is for
-things like ultimate message timeouts. */
-
-received_time = message_id_tv;
-
 /* If SMTP input, set the special handler for timeouts. The alarm() calls
 happen in the smtp_getc() function when it refills its buffer. */
 
@@ -2609,83 +2598,11 @@ if (extract_recip)
 
   }
 
-/* Now build the unique message id. This has changed several times over the
-lifetime of Exim. This description was rewritten for Exim 4.14 (February 2003).
-Retaining all the history in the comment has become too unwieldy - read
-previous release sources if you want it.
-
-The message ID has 3 parts: tttttt-pppppp-ss. Each part is a number in base 62.
-The first part is the current time, in seconds. The second part is the current
-pid. Both are large enough to hold 32-bit numbers in base 62. The third part
-can hold a number in the range 0-3843. It used to be a computed sequence
-number, but is now the fractional component of the current time in units of
-1/2000 of a second (i.e. a value in the range 0-1999). After a message has been
-received, Exim ensures that the timer has ticked at the appropriate level
-before proceeding, to avoid duplication if the pid happened to be re-used
-within the same time period. It seems likely that most messages will take at
-least half a millisecond to be received, so no delay will normally be
-necessary. At least for some time...
-
-There is a modification when localhost_number is set. Formerly this was allowed
-to be as large as 255. Now it is restricted to the range 0-16, and the final
-component of the message id becomes (localhost_number * 200) + fractional time
-in units of 1/200 of a second (i.e. a value in the range 0-3399).
-
-Some not-really-Unix operating systems use case-insensitive file names (Darwin,
-Cygwin). For these, we have to use base 36 instead of base 62. Luckily, this
-still allows the tttttt field to hold a large enough number to last for some
-more decades, and the final two-digit field can hold numbers up to 1295, which
-is enough for milliseconds (instead of 1/2000 of a second).
-
-However, the pppppp field cannot hold a 32-bit pid, but it can hold a 31-bit
-pid, so it is probably safe because pids have to be positive. The
-localhost_number is restricted to 0-10 for these hosts, and when it is set, the
-final field becomes (localhost_number * 100) + fractional time in centiseconds.
-
-Note that string_base62() returns its data in a static storage block, so it
-must be copied before calling string_base62() again. It always returns exactly
-6 characters.
-
-There doesn't seem to be anything in the RFC which requires a message id to
-start with a letter, but Smail was changed to ensure this. The external form of
-the message id (as supplied by string expansion) therefore starts with an
-additional leading 'E'. The spool file names do not include this leading
-letter and it is not used internally.
-
-NOTE: If ever the format of message ids is changed, the regular expression for
-checking that a string is in this format must be updated in a corresponding
-way. It appears in the initializing code in exim.c. The macro MESSAGE_ID_LENGTH
-must also be changed to reflect the correct string length. The queue-sort code
-needs to know the layout. Then, of course, other programs that rely on the
-message id format will need updating too. */
-
-Ustrncpy(message_id, string_base62((long int)(message_id_tv.tv_sec)), 6);
-message_id[6] = '-';
-Ustrncpy(message_id + 7, string_base62((long int)getpid()), 6);
-
-/* Deal with the case where the host number is set. The value of the number was
-checked when it was read, to ensure it isn't too big. The timing granularity is
-left in id_resolution so that an appropriate wait can be done after receiving
-the message, if necessary (we hope it won't be). */
-
-if (host_number_string)
-  {
-  id_resolution = BASE_62 == 62 ? 5000 : 10000;
-  sprintf(CS(message_id + MESSAGE_ID_LENGTH - 3), "-%2s",
-    string_base62((long int)(
-      host_number * (1000000/id_resolution) +
-        message_id_tv.tv_usec/id_resolution)) + 4);
-  }
-
-/* Host number not set: final field is just the fractional time at an
-appropriate resolution. */
-
-else
-  {
-  id_resolution = BASE_62 == 62 ? 500 : 1000;
-  sprintf(CS(message_id + MESSAGE_ID_LENGTH - 3), "-%2s",
-    string_base62((long int)(message_id_tv.tv_usec/id_resolution)) + 4);
-  }
+#ifdef EXPERIMENTAL_MAILFROM_MSGID
+if (!smtp_input || smtp_batched_input) generate_message_id();
+#else
+generate_message_id();
+#endif /* EXPERIMENTAL_MAILFROM_MSGID */
 
 /* Add the current message id onto the current process info string if
 it will fit. */
@@ -4303,23 +4220,6 @@ then we can think about properly declaring the message not-received. */
 
 
 TIDYUP:
-/* In SMTP sessions we may receive several messages in one connection. After
-each one, we wait for the clock to tick at the level of message-id granularity.
-This is so that the combination of time+pid is unique, even on systems where the
-pid can be re-used within our time interval. We can't shorten the interval
-without re-designing the message-id. See comments above where the message id is
-created. This is Something For The Future.
-Do this wait any time we have created a message-id, even if we rejected the
-message.  This gives unique IDs for logging done by ACLs. */
-
-if (id_resolution != 0)
-  {
-  message_id_tv.tv_usec = (message_id_tv.tv_usec/id_resolution) * id_resolution;
-  exim_wait_tick(&message_id_tv, id_resolution);
-  id_resolution = 0;
-  }
-
-
 process_info[process_info_len] = 0;			/* Remove message id */
 if (spool_data_file && cutthrough_done == NOT_TRIED)
   {
diff --git a/src/src/smtp_in.c b/src/src/smtp_in.c
index 6062e81187..24a9034eb6 100644
--- a/src/src/smtp_in.c
+++ b/src/src/smtp_in.c
@@ -2415,7 +2415,7 @@ TCP_SYN_RCV (as of 12.1) so no idea about data-use. */
 
 if (getsockopt(fileno(smtp_out), IPPROTO_TCP, TCP_FASTOPEN, &is_fastopen, &len) == 0)
   {
-  if (is_fastopen) 
+  if (is_fastopen)
     {
     DEBUG(D_receive)
       debug_printf("TFO mode connection (TCP_FASTOPEN getsockopt)\n");
@@ -4900,6 +4900,9 @@ while (done <= 0)
       /* Apply an ACL check if one is defined, before responding. Afterwards,
       when pipelining is not advertised, do another sync check in case the ACL
       delayed and the client started sending in the meantime. */
+      #ifdef EXPERIMENTAL_MAILFROM_MSGID
+      generate_message_id();
+      #endif
 
       if (acl_smtp_mail)
 	{
diff --git a/src/src/string.c b/src/src/string.c
index 9f1aeb81db..b87168f238 100644
--- a/src/src/string.c
+++ b/src/src/string.c
@@ -1822,4 +1822,116 @@ return 0;
 }
 #endif
 
+/* Build the unique message id. This has changed several times over the
+lifetime of Exim. This description was rewritten for Exim 4.14 (February 2003).
+Retaining all the history in the comment has become too unwieldy - read
+previous release sources if you want it.
+
+The message ID has 3 parts: tttttt-pppppp-ss. Each part is a number in base 62.
+The first part is the current time, in seconds. The second part is the current
+pid. Both are large enough to hold 32-bit numbers in base 62. The third part
+can hold a number in the range 0-3843. It used to be a computed sequence
+number, but is now the fractional component of the current time in units of
+1/2000 of a second (i.e. a value in the range 0-1999). After a message has been
+received, Exim ensures that the timer has ticked at the appropriate level
+before proceeding, to avoid duplication if the pid happened to be re-used
+within the same time period. It seems likely that most messages will take at
+least half a millisecond to be received, so no delay will normally be
+necessary. At least for some time...
+
+There is a modification when localhost_number is set. Formerly this was allowed
+to be as large as 255. Now it is restricted to the range 0-16, and the final
+component of the message id becomes (localhost_number * 200) + fractional time
+in units of 1/200 of a second (i.e. a value in the range 0-3399).
+
+Some not-really-Unix operating systems use case-insensitive file names (Darwin,
+Cygwin). For these, we have to use base 36 instead of base 62. Luckily, this
+still allows the tttttt field to hold a large enough number to last for some
+more decades, and the final two-digit field can hold numbers up to 1295, which
+is enough for milliseconds (instead of 1/2000 of a second).
+
+However, the pppppp field cannot hold a 32-bit pid, but it can hold a 31-bit
+pid, so it is probably safe because pids have to be positive. The
+localhost_number is restricted to 0-10 for these hosts, and when it is set, the
+final field becomes (localhost_number * 100) + fractional time in centiseconds.
+
+Note that string_base62() returns its data in a static storage block, so it
+must be copied before calling string_base62() again. It always returns exactly
+6 characters.
+
+There doesn't seem to be anything in the RFC which requires a message id to
+start with a letter, but Smail was changed to ensure this. The external form of
+the message id (as supplied by string expansion) therefore starts with an
+additional leading 'E'. The spool file names do not include this leading
+letter and it is not used internally.
+
+NOTE: If ever the format of message ids is changed, the regular expression for
+checking that a string is in this format must be updated in a corresponding
+way. It appears in the initializing code in exim.c. The macro MESSAGE_ID_LENGTH
+must also be changed to reflect the correct string length. The queue-sort code
+needs to know the layout. Then, of course, other programs that rely on the
+message id format will need updating too. */
+
+/* Build message_id from the current time and pid, set received_time, then wait
+for the clock to leave the id's time slot. No arguments, no return value. */
+void
+generate_message_id(void)
+{
+int id_resolution;
+
+/* Remember the time of reception. Exim uses time+pid for uniqueness of message
+ids, and fractions of a second are required. See the large comment that
+precedes this function for the layout of the id. */
+
+(void)gettimeofday(&message_id_tv, NULL);
+
+/* For other uses of the received time we can operate with granularity of one
+second, and for that we use the global variable received_time. This is for
+things like ultimate message timeouts. */
+
+received_time = message_id_tv;
+
+/* string_base62() reuses one static buffer, so each result is copied out
+before the next call. The sprintf() for the final field NUL-terminates. */
+
+Ustrncpy(message_id, string_base62((long int)(message_id_tv.tv_sec)), 6);
+message_id[6] = '-';
+Ustrncpy(message_id + 7, string_base62((long int)getpid()), 6);
+
+/* Deal with the case where the host number is set. The value of the number was
+checked when it was read, to ensure it isn't too big. The timing granularity is
+left in id_resolution so that the appropriate wait can be done further down. */
+
+if (host_number_string)
+  {
+  id_resolution = BASE_62 == 62 ? 5000 : 10000;
+  sprintf(CS(message_id + MESSAGE_ID_LENGTH - 3), "-%2s",
+    string_base62((long int)(
+      host_number * (1000000/id_resolution) +
+        message_id_tv.tv_usec/id_resolution)) + 4);
+  }
+
+/* Host number not set: final field is just the fractional time at an
+appropriate resolution. */
+
+else
+  {
+  id_resolution = BASE_62 == 62 ? 500 : 1000;
+  sprintf(CS(message_id + MESSAGE_ID_LENGTH - 3), "-%2s",
+    string_base62((long int)(message_id_tv.tv_usec/id_resolution)) + 4);
+  }
+
+/* Wait for the clock to tick at the level of message-id granularity, so that
+the combination of time+pid stays unique even when several messages arrive on
+one SMTP connection or the pid is re-used within our time interval. We can't
+shorten the interval without re-designing the message-id. Do this wait any time
+we have created a message-id, even if the message is later rejected. This gives
+unique IDs for logging done by ACLs.
+NOTE(review): the wait formerly ran after reception, when the tick had usually
+already passed; here it runs at once, so it may delay every call - confirm. */
+
+message_id_tv.tv_usec = (message_id_tv.tv_usec/id_resolution) * id_resolution;
+exim_wait_tick(&message_id_tv, id_resolution);
+}
+
 /* End of string.c */