6#include <stdexcept> // std::invalid_argument
9#include <arpa/inet.h> // Used only for checking for valid IP addresses in domain literals (inet_pton)
13 // --------------------------------------------------------------------------
14 // Constants that list sets of valid characters, which are optimized to test
15 // for ranges of the most commonly-used characters first during parsing, are
16 // named consistently with their respective rule names as defined in RFC2822.
18 // CRLF and \ are invisible in the quoted string according to RFC2822 section
21 // RFC2822 section 3.2.5 also defines a "quoted-string" as containing the
22 // following valid characters (spaces are also permitted): 33, 35-91, 93-126
23 // Certain characters must be quoted first though, and every character
24 // following a backslash is taken literally (and the backslash is removed
27 // RFC2822 section 3.2.4 defines an "atom" as containing the following valid
28 // characters: 0123456789
29 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
30 // abcdefghijklmnopqrstuvwxyz
31 // !#$%&'*+-/=?^_`{|}~
33 // Quote characters and quotation marks are not permitted in the domain part.
35 // According to RFC2822 section 3.2.5, a phrase (DisplayName / Comments) can
36 // be either an atom (ATEXT) or quoted-text (QTEXT).
38 // According to RFC2822 section 2.2.2, whitespace characters are tabs (ASCII
39 // character 9) and spaces (ASCII character 32).
41 // RFC2822 section 3.4 last paragraph indicates that a group construct is
42 // optional, and preceded by a colon following any number of comma-delimited
43 // recipients (including zero or one). Group constructs must end with a
45 // --------------------------------------------------------------------------
47 // --------------------------------------------------------------------------
48 // The following macros are optimized for performance by testing for the most
49 // commonly-used characters first.
52 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
53 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
54 // 47...57 /0123456789
59 // --------------------------------------------------------------------------
// ATEXT -- TRUE when (a) is an RFC2822 "atom" character: a letter, a digit, or
// one of !#$%&'*+-/=?^_`{|}~ (the most commonly-used ranges are tested first).
// Every use of the argument is parenthesized so that an expression argument
// such as ATEXT(c & 0x7f) is classified as a whole.  The argument is still
// evaluated more than once, so it must be free of side effects.
#define ATEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 65 && (a) <= 90) \
               || ((a) >= 47 && (a) <= 57) \
               || (a) == 45 || (a) == 33 \
               || ((a) >= 35 && (a) <= 39) \
               || (a) == 42 || (a) == 43 \
               || (a) == 61 || (a) == 63 )
68 // --------------------------------------------------------------------------
69 // ATEXT_OBS ("obsolete standard" is ATEXT plus periods, spaces, and tabs)
70 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
71 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
72 // 45...57 -./0123456789
73 // 32 | 33 {space:32}!
78 // --------------------------------------------------------------------------
79 #define ATEXT_OBS(a) ((a >= 94 && a <= 126) \
80 || (a >= 65 && a <= 90) \
81 || (a >= 45 && a <= 57) \
82 || a == 32 || a == 33 \
83 || (a >= 35 && a <= 39) \
84 || a == 42 || a == 43 \
85 || a == 61 || a == 63 \
88 // --------------------------------------------------------------------------
89 // CTEXT (comment text)
90 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
91 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
93 // --------------------------------------------------------------------------
// CTEXT -- TRUE when (a) is comment text: ASCII 33...39, 42...91, or 93...126
// (which leaves out the space, both parentheses, and the backslash character).
// Every use of the argument is parenthesized so that an expression argument
// such as CTEXT(c & 0x7f) is classified as a whole.  The argument is still
// evaluated more than once, so it must be free of side effects.
#define CTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 42 && (a) <= 91) \
               || ((a) >= 33 && (a) <= 39) )
98 // --------------------------------------------------------------------------
99 // CTEXT_WSP (comment text with white space)
100 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
101 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
102 // 32...39 {space:32}!"#$%&'
104 // --------------------------------------------------------------------------
105 #define CTEXT_WSP(a) ((a >= 93 && a <= 126) \
106 || (a >= 42 && a <= 91) \
107 || (a >= 32 && a <= 39) \
110 // --------------------------------------------------------------------------
111 // CTEXT_OBS (obsolete comment text)
115 // --------------------------------------------------------------------------
116 #define CTEXT_OBS(a) (CTEXT(a) \
120 // --------------------------------------------------------------------------
121 // DTEXT (domain-part, not including characters needed for domain-literals)
122 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
123 // 33...90 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ
124 // --------------------------------------------------------------------------
// DTEXT -- TRUE when (a) is domain-part text: ASCII 33...90 or 94...126 (which
// leaves out the space, both square brackets, and the backslash character).
// Every use of the argument is parenthesized so that an expression argument
// such as DTEXT(c & 0x7f) is classified as a whole.  The argument is still
// evaluated more than once, so it must be free of side effects.
#define DTEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 33 && (a) <= 90) )
128 // --------------------------------------------------------------------------
129 // FWS (folding white space)
133 // --------------------------------------------------------------------------
134 #define FWS(a) (a == 10 \
138 // --------------------------------------------------------------------------
139 // QTEXT (quoted text)
140 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
141 // 35...91 #$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
144 // --------------------------------------------------------------------------
// QTEXT -- TRUE when (a) is quoted text: ASCII 32, 33, 35...91, or 93...126
// (which leaves out the quotation mark and the backslash character).
// Every use of the argument is parenthesized so that an expression argument
// such as QTEXT(c & 0x7f) is classified as a whole.  The argument is still
// evaluated more than once, so it must be free of side effects.
#define QTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 35 && (a) <= 91) \
               || (a) == 32 || (a) == 33 )
149 // --------------------------------------------------------------------------
151 // 14...127 {char:14-31}{space:32}!"#$%&'()*+,-./0123456789:;<=>
152 // ?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^
153 // _`abcdefghijklmnopqrstuvwxyz{|}~
155 // 1...9 {char:1-6}{beep:7}{backspace:8}{tab:9}
156 // 11 | 12 {char:11}{char:12}
157 // --------------------------------------------------------------------------
// TEXT -- TRUE when (a) is ASCII 1...9, 11, 12, or 14...127 (which leaves out
// the null character, line feed (10), carriage return (13), and anything
// above 127).
// Every use of the argument is parenthesized so that an expression argument
// such as TEXT(c & 0x7f) is classified as a whole.  The argument is still
// evaluated more than once, so it must be free of side effects.
#define TEXT(a) (((a) >= 14 && (a) <= 127) \
              || ((a) >= 1 && (a) <= 9) \
              || (a) == 11 || (a) == 12 )
162 // --------------------------------------------------------------------------
166 // --------------------------------------------------------------------------
167 #define WSP(a) (a == 32 \
170 // --------------------------------------------------------------------------
171 // Used by the set() method to consistently reset internal variables when
172 // moving onward to the next token.
174 // token_begin: Configures beginning of next token.
175 // --------------------------------------------------------------------------
176 #define RESET_FOR_NEXT_TOKEN \
178 flag_quote = false; \
179 token_begin = offset + 1; \
183 /*======================================================================*//**
185 This @ref rmailaddr class provides an object-oriented eMail address.
189 Some of the key features are:
191 - constructors with sensible defaults help to simplify coding
192 - documentation includes code samples (with @c \#include lines as needed)
193 - can handle ASCIIZ without needing to specify string length
194 - can handle @c std::string (which tracks its own string length)
198 Validation of the format of an eMail address is helpful in ensuring that
199 eMail addresses received from elsewhere comply with internet standards.
203 I created this class to make it easier to write internet server daemons and
204 other software that needs to accept and/or handle eMail addresses. (This is
205 a complete re-write of the version I wrote in Java 17 years ago in 2007,
206 which includes a significant array of differences due to the improved parsing
207 approaches I use now that are more efficient, and the need to make sure that
208 UTF-8 characters and punycode are both handled in a transparent manner.)
212 @author Randolf Richardson
215 - 2024-May-07 v1.00 Initial version
216 - 2025-Feb-03 v1.00 Increased use of references and pointers
219 Lower-case letter "m" is regularly used in partial example code to represent
220 an instantiated rmailaddr object.
222 An ASCIIZ string is a C-string (char* array) that includes a terminating null
223 (0) character at the end.
227 I use the term "ASCIIZ string" to indicate an array of characters that's
228 terminated by a 0 (a.k.a., null). Although this is very much the same as a
229 C-string, the difference is that in many API functions a C-string must often
230 be accompanied by its length value. When referring to an ASCIIZ string, I'm
231 intentionally indicating that the length of the string is not needed because
232 the string is null-terminated. (This term was also commonly used in assembly
233 language programming in the 1970s, 1980s, and 1990s, and as far as I know is
234 still used by machine language programmers today.)
239 #include <iostream> // std::cout, std::cerr, std::endl, etc.
240 #include <stdexcept> // std::invalid_argument exception
242 #include <randolf/rmailaddr>
244 int main(int argc, char *argv[]) {
246 randolf::rmailaddr m("nobody@example.com");
247 } catch (const std::invalid_argument& e) {
248 std::cerr << "eMail address format exception: " << e.what() << std::endl;
250 } catch (const std::exception& e) {
251 std::cerr << "Other exception: " << e.what() << std::endl;
255 } // -x- int main -x-
258 Parameter stacking is supported (with methods that return @c rmailaddr*); in
259 this example, notice that semicolons (";") and "e." references are omitted
260 (when compared with the above):
263 #include <iostream> // std::cout, std::cerr, std::endl, etc.
264 #include <stdexcept> // std::invalid_argument exception
266 #include <randolf/rmailaddr>
268 int main(int argc, char *argv[]) {
270 randolf::rmailaddr m("nobody@example.com");
271 } catch (const std::invalid_argument& e) {
272 std::cerr << "eMail address format exception: " << e.what() << std::endl;
274 } catch (const std::exception& e) {
275 std::cerr << "Other exception: " << e.what() << std::endl;
279 } // -x- int main -x-
281 *///=========================================================================
285 /*======================================================================*//**
287 Structure of errors (only used when exceptions are disabled).
289 @see policy_throw_exceptions
290 *///=========================================================================
294 /// Offset (0 = position of first byte)
296 }; // -x- struct error_data -x-
299 /*======================================================================*//**
301 Structure of positions within the original eMail string where a portion
302 begins, and its length (in bytes), along with various other information about
305 This is used internally, and std::vector<mail_addr_token> organizes them and
306 looks after freeing memory.
307 *///=========================================================================
308 struct mail_addr_token {
310 /// g = group name (beginning; includes colon)
311 /// ; = group termination (semi-colon character)
313 /// e = eMail address (includes angle brackets, if present)
317 /// \0 = not initialized (null can effectively be regarded as meaning "unknown")
319 /// Offset, within the string, where this part begins
320 unsigned int offset = 0;
321 /// Total number of bytes
322 unsigned int len = 0;
323 /// Whether any UTF-8 characters are present in this part
324 bool flag_utf8 = false;
325 /// Whether this part is in punycode (begins with "xn--")
326 bool flag_punycode = false; // TODO
327 /// Whether this part is "obsolete" (according to RFCs)
328 bool flag_obsolete = false; // TODO
329 /// Whether eMail address was enclosed in angle brackets (type "e" only)
330 bool flag_angle = false;
331 /// Whether the token was enclosed in quotation marks
332 bool flag_quotes = false;
333 /// Whether eMail address is a null address enclosed in angle brackets (type "e" only)
334 bool flag_null_addr = false;
335 /// Whether the domain-part is an FQDN (type "d" only)
336 bool flag_fqdn = false; // TODO
337 /// Whether the domain-part is a domain-literal (type "d" only)
338 bool flag_domain_literal = false; // TODO
339 /// Depth of groups (types "g" and ";" only)
340 unsigned short depth = 0;
341 /// Processed data, with quotation marks, angle brackets, comments, whitespace, etc., removed
342 std::u8string p_token;
343 /// Index to display-name (type "e" only)
344 int index_display_name = -1;
345 /// Index to local-part (type "e" only)
346 int index_local_part = -1;
347 /// Index to domain-part (type "e" only)
348 int index_domain_part = -1;
349 }; // -x- struct mail_addr_token -x-
351 // --------------------------------------------------------------------------
352 // Internal variables.
353 // --------------------------------------------------------------------------
354 std::u8string _addr; // Original eMail address
355 std::vector<mail_addr_token> _tokens; // All eMail address tokens (see mail_addr_token)
356 std::vector<int> _index_e; // Positions within _tokens of the type "e" (eMail address) records
357 std::vector<error_data> _errors; // Errors saved here instead of being thrown (see _policy_throw_exceptions)
358 short group_depth = 0; // Recursive group tracking
359 bool angle_bracket_mode = false; // Angle-bracket mode tracking
360 bool quote_mode = false; // Quotation-marks mode tracking
362 // --------------------------------------------------------------------------
364 // --------------------------------------------------------------------------
365 bool _policy_keep_comments = false; // Whether to retain comments embedded in eMail addresses
366 bool _policy_throw_exceptions = true; // TRUE = throw exceptions; FALSE = save errors internally (in _errors)
367 bool _policy_tabs_to_spaces = false; // Whether to convert every tab into a space
368 bool _policy_support_utf8 = true; // Whether to support UTF-8 (FALSE = 7-bit characters only)
370 /*======================================================================*//**
372 *///=========================================================================
375 const std::string& message,
376 /// Offset (0 = position of first byte)
378 if (_policy_throw_exceptions) throw std::invalid_argument(message + " at offset " + std::to_string(offset));
379 _errors.push_back({ message, offset });
381 } // -x- void _exception -x-
384 /*======================================================================*//**
386 Instantiate an empty rmailaddr that doesn't qualify as a properly-formatted
387 internet eMail address (because the minimum length of a valid internet eMail
388 address is 1 character).
390 Instantiating an empty rmailaddr is particularly useful for header-file
391 definitions; for example:
393 #include <iostream> // std::cout, std::cerr, std::endl, etc.
394 #include <stdexcept> // std::invalid_argument exception
396 #include <randolf/rmailaddr>
398 randolf::rmailaddr m; // <-- Empty rmailaddr initialization (no exceptions)
400 int main(int argc, char *argv[]) {
402 m.set("nobody@example.com");
403 } catch (const std::invalid_argument& e) {
404 std::cerr << "eMail address format exception: " << e.what() << std::endl;
406 } catch (const std::exception& e) {
407 std::cerr << "Other exception: " << e.what() << std::endl;
411 } // -x- int main -x-
413 *///=========================================================================
414 rmailaddr() noexcept {} // -x- constructor rmailaddr -x-
416 /*======================================================================*//**
418 Instantiate an rmailaddr that qualifies as a properly-formatted internet
419 eMail address (if it doesn't qualify, then an exception will be thrown).
423 #include <iostream> // std::cout, std::cerr, std::endl, etc.
424 #include <stdexcept> // std::invalid_argument exception
426 #include <randolf/rmailaddr>
428 int main(int argc, char *argv[]) {
430 randolf::rmailaddr m("nobody@example.com");
431 } catch (const std::invalid_argument& e) {
432 std::cerr << "eMail address format exception: " << e.what() << std::endl;
434 } catch (const std::exception& e) {
435 std::cerr << "Other exception: " << e.what() << std::endl;
439 } // -x- int main -x-
441 @throws std::invalid_argument describing the problem, along with the byte
442 offset where the problem originated from
444 *///=========================================================================
446 /// RFC-compliant eMail address
447 const char8_t* mailbox,
448 /// Number of characters (-1 = ASCIIZ string)
449 const int len = -1) {
451 } // -x- constructor rmailaddr -x-
453 /*======================================================================*//**
454 @copydoc rmailaddr(const char8_t*, int)
456 *///=========================================================================
458 /// RFC-compliant eMail address
460 /// Number of characters (-1 = ASCIIZ string)
461 const int len = -1) {
462 set((char8_t*)mailbox, len);
463 } // -x- constructor rmailaddr -x-
465 /*======================================================================*//**
466 @copydoc rmailaddr(const char8_t*, int)
468 *///=========================================================================
470 /// RFC-compliant eMail address
471 const std::string& mailbox) {
472 set((char8_t*)mailbox.data(), mailbox.size());
473 } // -x- constructor rmailaddr -x-
475 /*======================================================================*//**
476 @copydoc rmailaddr(const char8_t*, int)
478 *///=========================================================================
480 /// RFC-compliant eMail address
481 const std::u8string& mailbox) {
482 set(mailbox.data(), mailbox.size());
483 } // -x- constructor rmailaddr -x-
485 /*======================================================================*//**
487 Access only the eMail address, without display-name, and without any sets of
488 enclosing quotation-marks or enclosing angle-brackets, etc.
494 @throws std::out_of_range if the index is out-of-range
495 @returns std::string with only the eMail address (no display-name, and no
496 enclosing sets of quotation-marks or enclosing angle-brackets, etc.)
497 *///=========================================================================
499 /// Index of eMail address to query for (0 = first element; negative index
500 /// values are calculated in reverse, starting with -1 as the final position)
501 const int index = 0) {
502// return std::string((char*)_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token.c_str());
503 return std::string((char*)_tokens.at(_index_e.at(index >= 0 ? index : _index_e.size() + index)).p_token.c_str());
504 } // -x- std::string addr -x-
506 /*======================================================================*//**
508 Access an eMail address's display-name (the portion preceding the angle
509 brackets). If there were no angle-brackets, then an empty string will
516 @returns std::string with only the display-name (no quotation marks, etc.)
517 *///=========================================================================
518 std::string display_name(
519 /// Index of eMail address to query for (0 = first element; negative index
520 /// values are calculated in reverse, starting with -1 as the final position)
521 const int index = 0) {
522 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name].p_token.c_str());
523 } // -x- std::string display_name -x-
525 /*======================================================================*//**
527 Access an eMail address's domain-part (the portion following the @c @ sign).
534 @returns std::string with only the domain-part (no angle brackets, etc.)
535 *///=========================================================================
536 std::string domain_part(
537 /// Index of eMail address to query for (0 = first element; negative index
538 /// values are calculated in reverse, starting with -1 as the final position)
540 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_domain_part].p_token.c_str());
541 } // -x- std::string domain_part -x-
543 /*======================================================================*//**
545 Access an eMail address (enclosed in angle-brackets), and preceded by the
546 display-name (if one is available).
549 If the original form of the display-name had a delimiting space before the
550 eMail address, then that space will be present in the result here. If not, a
551 space will not be inserted. (In other words, this aspect of the original
552 full eMail address will be retained.)
558 @returns std::string with display-name and eMail address (in angle-brackets)
559 *///=========================================================================
561 /// Index of eMail address to query for (0 = first element; negative index
562 /// values are calculated in reverse, starting with -1 as the final position)
565 // --------------------------------------------------------------------------
566 // The eMail address has no display-name because it wasn't enclosed in angle
567 // brackets, so present the eMail address on its own, in angle brackets.
568 // --------------------------------------------------------------------------
569 mail_addr_token e = _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]];
570 if (e.index_display_name < 0)
571 return "<" + std::string((char*)e.p_token.c_str()) + ">";
573 // --------------------------------------------------------------------------
574 // There was a display-name, so return the eMail address with display-name
575 // (enclosed in quotation marks if it started out that way).
576 // --------------------------------------------------------------------------
577 mail_addr_token n = _tokens[e.index_display_name];
580 + std::string((char*)n.p_token.c_str())
583 + std::string((char*)e.p_token.c_str())
586 return std::string((char*)n.p_token.c_str())
588 + std::string((char*)e.p_token.c_str())
591 } // -x- std::string email -x-
593 /*======================================================================*//**
595 Find out if this object doesn't hold any eMail addresses.
600 @returns TRUE = no eMail addresses@n
601 FALSE = one or more eMail addresses
602 *///=========================================================================
604 return _index_e.empty();
605 } // -x- bool empty -x-
607 /*======================================================================*//**
609 Return a list of errors that have been collected (instead of throwing
612 @see policy_throw_exceptions
613 @returns Vector containing @c error_data
614 *///=========================================================================
615 std::vector<error_data> errors() {
617 } // -x- std::vector<error-data> errors -x-
619 /*======================================================================*//**
621 Clear the list of errors that have been collected (instead of throwing
624 @see policy_throw_exceptions
625 @returns The same rmailaddr object so as to facilitate stacking
626 *///=========================================================================
627 rmailaddr& errors_clear() {
630 } // -x- rmailaddr& errors_clear -x-
632 /*======================================================================*//**
634 Grade an eMail address, similar to traditional elementary school grades. For
635 simplicity, grades "a" through "c" are passes, while grades "d" through "f"
636 are failures, although if less strict then "d" should also qualify as a pass.
640 a = Angle-brackets surrounding eMail address (optional display-name)
641 b = Bare eMail address (no display-name)
642 c = Complex eMail address (groups; optional angle-brackets; optional display-name)
643 d = Defective (because obsolete RFC standards were utilized)
644 e = Errors (only when collecting errors instead of throwing exceptions)
645 f = Failure (an exception was thrown, or eMail address is blank)
648 To test for a pass, use a comparison such as <tt>m.grade() <= 'c'</tt>
649 (strict) or <tt>m.grade() <= 'd'</tt> (not strict).
651 *///=========================================================================
653 /// eMail address index (default is 0 for the first eMail address)
654 const int index = 0) {
656 } // -x- char grade -x-
658 /*======================================================================*//**
660 Indicates whether a display-name was included with this eMail address.
661 @returns TRUE = eMail address includes a display-name@n
662 FALSE = eMail address has no display-name
663 *///=========================================================================
664 bool has_display_name(
665 /// eMail address index (default is 0 for the first eMail address)
666 const int index = 0) {
667 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
668 } // -x- bool has_display-name -x-
670 /*======================================================================*//**
672 Find out whether this object holds any number of eMail addresses. If there
673 are no eMail addresses, then this method returns @c FALSE.
678 @returns TRUE = one or more eMail addresses@n
679 FALSE = no eMail addresses
680 *///=========================================================================
682 return _index_e.size() > 0;
683 } // -x- bool has_any -x-
685 /*======================================================================*//**
687 Find out whether this object holds multiple eMail addresses. If there is
688 only one eMail address, or no eMail addresses at all, then this method
694 @returns TRUE = two or more eMail addresses@n
695 FALSE = one eMail address@n
696 FALSE = no eMail addresses
697 *///=========================================================================
698 bool has_multiple() {
699 return _index_e.size() > 1;
700 } // -x- bool has_multiple -x-
702 /*======================================================================*//**
704 Find out whether this object holds exactly one eMail address. If there are
705 two or more eMail addresses, or no eMail addresses, then this method returns
711 @returns TRUE = exactly one eMail address@n
712 FALSE = two or more eMail addresses@n
713 FALSE = no eMail addresses
714 *///=========================================================================
716 return _index_e.size() == 1;
717 } // -x- bool has_one -x-
719 /*======================================================================*//**
721 Find out the state of this policy.
722 @see policy_keep_comments
723 @returns policy status
724 *///=========================================================================
725 bool is_policy_keep_comments() {
726 return _policy_keep_comments;
727 } // -x- bool is_policy_keep_comments -x-
729 /*======================================================================*//**
731 Find out the state of this policy.
732 @see policy_tabs_to_spaces
733 @returns policy status
734 *///=========================================================================
735 bool is_policy_tabs_to_spaces() {
736 return _policy_tabs_to_spaces;
737 } // -x- bool is_policy_tabs_to_spaces -x-
739 /*======================================================================*//**
741 Find out the state of this policy.
742 @see policy_throw_exceptions
743 @returns policy status
744 *///=========================================================================
745 bool is_policy_throw_exceptions() {
746 return _policy_throw_exceptions;
747 } // -x- bool is_policy_throw_exceptions -x-
749 /*======================================================================*//**
751 Find out the state of this policy.
752 @see policy_support_utf8
753 @returns policy status
754 *///=========================================================================
755 bool is_policy_support_utf8() {
756 return _policy_support_utf8;
757 } // -x- bool is_policy_support_utf8 -x-
759 /*======================================================================*//**
761 Indicates whether this is just an eMail address, without any other parts such
762 as display-name, group constructs, comments, etc.
763 @returns TRUE = eMail address includes a display-name@n
764 FALSE = eMail address has no display-name
765 *///=========================================================================
767 /// indicate whether angle-brackets are okay (default is FALSE so that the
768 /// meaning of the word "pure" is not tainted)
769 const bool angle_flag = false,
770 /// eMail address index (default is 0 for the first eMail address)
771 const int index = 0) {
772//TODO: Finish this (we need to consider groups, display-name, comments, etc.)
773 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
774 } // -x- bool is_pure -x-
776 /*======================================================================*//**
778 Access an eMail address's local-part (the portion preceding the @c @ sign).
784 @returns std::string with only the local-part (no angle brackets, etc.)
785 *///=========================================================================
786 std::string local_part(
787 /// Index of eMail address to query for (0 = first element; negative index
788 /// values are calculated in reverse, starting with -1 as the final position)
790 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_local_part].p_token.c_str());
791 } // -x- std::string local_part -x-
793 /*======================================================================*//**
795 Sets the policy for whether to keep comments that were embedded in eMail
796 address group-construct, display-name, and local-part portions.
798 Comments are excluded by default because most systems don't need them, but in
799 the event that they are needed (or wanted), this policy makes it possible to
800 make sure they aren't excluded during processing.
802 @see is_policy_keep_comments
803 @returns The same rmailaddr object so as to facilitate stacking
804 *///=========================================================================
805 rmailaddr& policy_keep_comments(
806 /// FALSE = do not retain comments embedded in eMail addresses (default)@n
807 /// TRUE = retain comments embedded in eMail addresses
810 // --------------------------------------------------------------------------
811 // Update internal policy.
812 // --------------------------------------------------------------------------
813 _policy_keep_comments = policy_flag;
815 // --------------------------------------------------------------------------
816 // Return this object to facilitate stacking.
817 // --------------------------------------------------------------------------
820 } // -x- rmailaddr& policy_keep_comments -x-
822 /*======================================================================*//**
824 Sets the policy for whether to support UTF-8 characters.
826 Some older systems may not be able to handle 8-bit data that UTF-8 utilizes,
827 in which case this policy makes it possible to easily reject incompatible
828 eMail addresses before attempting to use them with such systems.
830 @see is_policy_support_utf8
831 @returns The same rmailaddr object so as to facilitate stacking
832 *///=========================================================================
833 rmailaddr& policy_support_utf8(
834 /// TRUE = support UTF-8 characters in eMail addresses (default)@n
835 /// FALSE = do not support UTF-8 characters in eMail addresses
838 // --------------------------------------------------------------------------
839 // Update internal policy.
840 // --------------------------------------------------------------------------
841 _policy_support_utf8 = policy_flag;
843 // --------------------------------------------------------------------------
844 // Return this object to facilitate stacking.
845 // --------------------------------------------------------------------------
848 } // -x- rmailaddr& policy_support_utf8 -x-
850 /*======================================================================*//**
852 Sets the policy for whether to convert every tab character (ASCII character 9)
853 to a space (ASCII character 32). This conversion occurs only once when the
854 eMail address is initially specified in a constructor or by way of one of the
855 @ref set() methods (changing this policy after this point will not be applied
856 to the current eMail address, but it will be in effect for future calls to
857 any of the @ref set() methods).
859 There are some situations where a tab character can create problems, such as
860 when interacting with certain older software or software that makes incorrect
861 assumptions about how to parse an eMail address, and this policy makes it
862 easy to accommodate such situations for the tab character, which some users
863 may be including by using the tab key on their keyboards.
865 @see is_policy_tabs_to_spaces
866 @returns The same rmailaddr object so as to facilitate stacking
867 *///=========================================================================
868 rmailaddr& policy_tabs_to_spaces(
869 /// TRUE = convert every tab character to a space@n
870 /// FALSE = do not convert tab characters to spaces (default)
873 // --------------------------------------------------------------------------
874 // Update internal policy.  The flag is only read by set() while it stores a
// new address, so changing it here does not touch an address that has
// already been stored.
875 // --------------------------------------------------------------------------
876 _policy_tabs_to_spaces = policy_flag;
878 // --------------------------------------------------------------------------
879 // Return this object to facilitate stacking (chained policy_*() calls).
880 // --------------------------------------------------------------------------
883 } // -x- rmailaddr& policy_tabs_to_spaces -x-
885 /*======================================================================*//**
887 Sets the policy for whether to throw exceptions when an error is encountered.
889 When this flag is cleared (FALSE), errors are tracked internally instead of throwing any
890 exceptions, and will need to be retrieved using the @ref errors() method,
891 which is useful for analyzing an eMail address. (Enabling or disabling this
892 flag does not erase the errors that are stored internally; you will need to
893 use the @ref errors_clear method for this.)
896 This policy is not meant for general use in the majority of applications; it
897 is intended for technical analysis, which would be useful in diagnostic and
898 research applications, or packet analysis applications like WireShark, or for
899 advanced users who are interested in more technical detail.
902 @see is_policy_throw_exceptions
903 @returns The same rmailaddr object so as to facilitate stacking
904 *///=========================================================================
905 rmailaddr& policy_throw_exceptions(
906 /// TRUE = throw exceptions (default)@n
907 /// FALSE = don't throw exceptions (errors are tracked internally instead and
/// can be retrieved with the @ref errors() method)
910 // --------------------------------------------------------------------------
911 // Update internal policy.
// NOTE(review): presumably consulted by _exception(), which the parser calls
// for every problem it detects -- confirm against that helper.
912 // --------------------------------------------------------------------------
913 _policy_throw_exceptions = policy_flag;
915 // --------------------------------------------------------------------------
916 // Return this object to facilitate stacking (chained policy_*() calls).
917 // --------------------------------------------------------------------------
920 } // -x- rmailaddr& policy_throw_exceptions -x-
922 /*======================================================================*//**
924 Set a new eMail address, resetting all internal flags, counters, and arrays
925 (but not changing any existing policies). Any existing eMail addresses will
926 be cleared out. (This method is also used internally by most of this class's
928 @throws std::invalid_argument describing the problem, along with the byte
929 offset where the problem originated from
931 @returns The same rmailaddr object so as to facilitate stacking
932 *///=========================================================================
934 /// RFC-compliant eMail address
936 /// Number of characters, counted in bytes (-1 = ASCIIZ string, which is
/// measured up to its terminating NUL by the char8_t overload)
// Delegate to the char8_t overload.  The cast only reinterprets the same bytes
// as UTF-8 code units; no transcoding or copying takes place here.
938 set((char8_t*)mailbox, len);
940 } // -x- rmailaddr& set -x-
942 /*======================================================================*//**
943 @copydoc set(const char*, int)
945 @returns The same rmailaddr object so as to facilitate stacking
946 *///=========================================================================
948 /// RFC-compliant eMail address
949 const std::string& mailbox) {
// Delegate to the char8_t overload with an explicit length, so the string's
// full size() is used rather than stopping at an embedded NUL byte.  The
// cast only reinterprets the bytes as UTF-8 code units (no transcoding).
950 set((char8_t*)mailbox.data(), mailbox.size());
952 } // -x- rmailaddr& set -x-
954 /*======================================================================*//**
955 @copydoc set(const char*, int)
957 @returns The same rmailaddr object so as to facilitate stacking
958 *///=========================================================================
960 /// RFC-compliant eMail address (already stored as UTF-8 code units)
961 const std::u8string& mailbox) {
// Delegate to the char8_t overload with an explicit length; no cast is needed
// because data() already yields a const char8_t pointer.
962 set(mailbox.data(), mailbox.size());
964 } // -x- rmailaddr& set -x-
966 /*======================================================================*//**
967 @copydoc set(const char*, int)
969 @returns The same rmailaddr object so as to facilitate stacking
970 *///=========================================================================
972 /// RFC-compliant eMail address
973 const char8_t* mailbox,
974 /// Number of characters (-1 = ASCIIZ string)
977 // --------------------------------------------------------------------------
978 // Measure size of the mailbox string if an ASCIIZ string was indicated.
979 // --------------------------------------------------------------------------
980 if (len == -1) len = std::strlen((char*)mailbox);
982 // --------------------------------------------------------------------------
983 // Save a copy of the original eMail address.
984 // --------------------------------------------------------------------------
985 _addr.assign(mailbox, len); // We need to save this for later reference
987 // --------------------------------------------------------------------------
988 // Pre-adjustments (optional, as per policy flags).
//
// Each tab is overwritten in place.  Calling the member function
// _addr.replace(first, last, '\t', ' ') would select the
// (first, last, count, ch) overload and replace the whole address with nine
// spaces (the value of '\t' being taken as the count), which is not what the
// policy describes.  The in-place swap keeps the length of _addr -- and
// therefore every token offset recorded below -- unchanged.
989 // --------------------------------------------------------------------------
990 if (_policy_tabs_to_spaces) // Policy: Convert all tabs to spaces
991 for (char8_t& c : _addr) if (c == u8'\t') c = u8' '; // Length-preserving replacement
993 // --------------------------------------------------------------------------
994 // Internal variables.
995 // --------------------------------------------------------------------------
996 int offset = 0; // Offset within the saved _addr copy of the mailbox
997 int last_display_name = -1; // Used to build type "e" eMail tokens
998 int last_local_part = -1; // Used to build type "e" eMail tokens
999 int last_domain_part = -1; // Used to build type "e" eMail tokens
1001 // --------------------------------------------------------------------------
1002 // Internal variables that are reset or updated together at various times,
1003 // such as when a token is [in most cases] completed.
1004 // --------------------------------------------------------------------------
1005 int token_begin = 0; // Beginning offset within current portion of string being parsed
1006 char8_t ch; // Character being tested (this needs to be defined outside of the main loop)
1007 bool flag_utf8 = false; // UTF8 character(s) detected
1008 bool flag_angle = false; // Angle-bracket detected
1009 bool flag_quote = false; // Quotation-marks mode detected
1010 bool active_angle = false; // Angle-bracket mode is active
1011 bool active_at_sign = false; // At-sign mode is active (domain-part instead of local-part interpretation)
1012 bool active_quote = false; // Quotation-marks mode is active
1013 int comment_depth = 0; // Comments are active when this value is greater than 0 (too many closed comments are in the negative)
1014 std::u8string p_token; // Processed token data (angle brackets, quotation marks, comments, and whitespace omitted)
1015 std::u8string p_token_sp; // Processed token data (angle brackets, quotation marks, and comments omitted), with spaces preserved
1017 // --------------------------------------------------------------------------
1018 // Main parsing loop that identifies tokens and ensures compliance, and also
1019 // effectively pre-processes eMail addresses on-the-fly for faster access
1020 // from the _emails vector later.
1021 // --------------------------------------------------------------------------
1024 // --------------------------------------------------------------------------
1025 // Obtain next character.
//
// The character is taken from the saved copy (_addr) rather than from the
// caller's buffer so that the tabs-to-spaces pre-adjustment is honoured by
// the parser and ends up in the processed tokens.  _addr holds exactly len
// characters, so offsets are identical to those of the caller's buffer, and
// _addr[len] is a valid read (it yields NUL) when the address is empty.
1026 // --------------------------------------------------------------------------
1027 ch = _addr[offset];
1029 // --------------------------------------------------------------------------
1030 // Compare one character at a time, but first process special cases of quoted
1031 // data (copy most of the data) and comments (ignore the data).
1032 // --------------------------------------------------------------------------
1033 if (flag_quote && active_quote && ch != '"') {
1034 if (QTEXT(ch)) { // Include only quoted text
1035 p_token.push_back(ch);
1036 p_token_sp.push_back(ch);
1037 } // -x- if QTEXT -x-
1039 } else if (comment_depth > 0 && ch != ')') { // Ignore all comment data
1040 if (_policy_keep_comments) {
1041 p_token.push_back(ch);
1042 p_token_sp.push_back(ch);
1043 } // -x- if _policy_keep_comments -x-
1046 main_parsing_switch: switch (ch) {
1048 // --------------------------------------------------------------------------
1049 // Quotation mark: toggles quotation-marks mode (which may be opened only
// once per token).
1050 // --------------------------------------------------------------------------
1052 if (!active_quote) { // Enable quotation-marks mode
1053 if (flag_quote) _exception("quotation-marks mode can't be re-opened", offset);
1054 active_quote = true;
1056 } else { // Disable quotation-marks mode
1057 active_quote = false;
1062 // --------------------------------------------------------------------------
1063 // Group name ends with a colon.
1064 // --------------------------------------------------------------------------
1067 // --------------------------------------------------------------------------
1068 // Internal tracking.
1069 // --------------------------------------------------------------------------
1072 // --------------------------------------------------------------------------
1073 // Add this token to the tokens vector.
1074 // --------------------------------------------------------------------------
1075 _tokens.push_back({ .type = 'g',
1076 .offset = token_begin,
1077 .len = offset - token_begin,
1078 .flag_utf8 = flag_utf8,
1079 .p_token = p_token_sp, });
1081 // --------------------------------------------------------------------------
1082 // Reset and prepare internal variables for the next token.
1083 // --------------------------------------------------------------------------
1084 RESET_FOR_NEXT_TOKEN;
1089 // --------------------------------------------------------------------------
1090 // Group of eMail addresses is terminated by a semi-colon.
1091 // --------------------------------------------------------------------------
1094 // --------------------------------------------------------------------------
1095 // Internal tracking.
1096 // --------------------------------------------------------------------------
1097 if (--group_depth < 0) _exception("too many group construct terminators", offset);
1098 if (active_angle) _exception("unbalanced open angle bracket", offset);
1100 // --------------------------------------------------------------------------
1101 // Add this token terminator to the tokens vector.
1102 // --------------------------------------------------------------------------
1103 _tokens.push_back({ .type = ';',
1104 .offset = token_begin,
1105 .len = offset - token_begin,
1106 .flag_utf8 = flag_utf8,
1107 .p_token = p_token_sp, });
1109 // --------------------------------------------------------------------------
1110 // Reset and prepare internal variables for the next token.
1111 // --------------------------------------------------------------------------
1112 RESET_FOR_NEXT_TOKEN;
1117 // --------------------------------------------------------------------------
1118 // Opening angle bracket.
1119 // --------------------------------------------------------------------------
1122 // --------------------------------------------------------------------------
1123 // Internal tracking.
1124 // --------------------------------------------------------------------------
1125 if (flag_angle) _exception("unbalanced open angle bracket", offset);
1126 active_angle = true;
1129 // --------------------------------------------------------------------------
1130 // Add the display-name token (the text that precedes the angle bracket) to
// the tokens vector if a display-name exists.
1131 // --------------------------------------------------------------------------
1132 if (token_begin < offset) {
1133 last_display_name = _tokens.size();
1134 _tokens.push_back({ .type = 'n',
1135 .offset = token_begin,
1136 .len = offset - token_begin,
1137 .flag_utf8 = flag_utf8,
1138 .p_token = p_token_sp, });
1139 } // -x- if token_begin -x-
1141 // --------------------------------------------------------------------------
1142 // Reset and prepare internal variables for the next token.
1143 // --------------------------------------------------------------------------
1144 RESET_FOR_NEXT_TOKEN;
1149 // --------------------------------------------------------------------------
1150 // At sign ("@") delimiter.
1151 // --------------------------------------------------------------------------
1154 // --------------------------------------------------------------------------
1155 // Internal tracking.
1156 // --------------------------------------------------------------------------
1157 if (active_at_sign) _exception("too many at (\"@\") signs", offset);
1158 active_at_sign = true;
1160 // --------------------------------------------------------------------------
1161 // Add the local-part token (everything collected before the at sign) to the
// tokens vector, and remember its position for the type "e" token.
1162 // --------------------------------------------------------------------------
1163 last_local_part = _tokens.size();
1164 _tokens.push_back({ .type = 'l',
1165 .offset = token_begin,
1166 .len = offset - token_begin,
1167 .flag_utf8 = flag_utf8,
1168 .flag_angle = flag_angle,
1169 .p_token = p_token, });
1171 // --------------------------------------------------------------------------
1172 // Reset and prepare internal variables for the next token.
1173 // --------------------------------------------------------------------------
1174 RESET_FOR_NEXT_TOKEN;
1179 // --------------------------------------------------------------------------
1180 // Closing angle-bracket.
1181 // --------------------------------------------------------------------------
1184 // --------------------------------------------------------------------------
1185 // Internal tracking.
1186 // --------------------------------------------------------------------------
1187 if (!active_angle) _exception("unbalanced closing angle bracket", offset);
1188 active_angle = false;
1189 goto main_parsing_email;
1191 // --------------------------------------------------------------------------
1192 // Reset and prepare internal variables for the next token.
// NOTE(review): this statement directly follows an unconditional goto, so it
// is not reached from the closing angle-bracket handling above.
1193 // --------------------------------------------------------------------------
1194 RESET_FOR_NEXT_TOKEN;
1199 // --------------------------------------------------------------------------
1200 // Comma delimiter, signifies the end of an eMail address.
1201 // --------------------------------------------------------------------------
1205 // --------------------------------------------------------------------------
1206 // Internal tracking.
1207 // --------------------------------------------------------------------------
1208 if (active_quote) _exception("unbalanced quotation-marks", offset);
1209 if (active_angle) _exception("unbalanced open angle bracket before comma", offset);
1212 // --------------------------------------------------------------------------
1213 // Add the token that this delimiter closes to the tokens vector: a
// domain-part when an at sign was seen, otherwise a lone local-part.
1214 // --------------------------------------------------------------------------
1215 if (active_at_sign) { // Domain-part has been started
1216 last_domain_part = _tokens.size();
1217 _tokens.push_back({ .type = 'd',
1218 .offset = token_begin,
1219 .len = offset - token_begin,
1220 .flag_utf8 = flag_utf8,
1221 .flag_angle = _tokens[last_local_part].flag_angle,
1222 .p_token = p_token, });
1223 active_at_sign = false;
1224 } else { // Domain-part has not been started, so there's only a local-part here
1225 last_local_part = _tokens.size();
1226 _tokens.push_back({ .type = 'l',
1227 .offset = token_begin,
1228 .len = offset - token_begin,
1229 .flag_utf8 = flag_utf8,
1230 .flag_angle = flag_angle,
1231 .p_token = p_token, });
1232 } // -x- if active_at_sign -x-
1234 // --------------------------------------------------------------------------
1235 // Perform a few checks to make sure we're not creating phantom addresses.
1236 // --------------------------------------------------------------------------
1237 int __email_len = last_domain_part == -1 ? _tokens[last_local_part].len : (_tokens[last_domain_part].offset - _tokens[last_local_part].offset) + _tokens[last_domain_part].len;
1238//std::cout << "__email_len=" << std::to_string(__email_len) << std::endl;
1239 if (__email_len == 0 && !flag_angle) continue;
1240//std::cout << "last_local_part=" << std::to_string(last_local_part) << std::endl;
1241//std::cout << "last_domain_part=" << std::to_string(last_domain_part) << std::endl;
1243 // --------------------------------------------------------------------------
1244 // Create a token of type "e" now that this eMail address is closed.
1246 // The reason we're calculating size based on offsets instead of by adding
1247 // sizes together (and adding 1 for the "@" sign) is that comments can be
1248 // included in the localpart portion, which normally won't be counted in any
1250 // --------------------------------------------------------------------------
1251 _index_e.push_back(_tokens.size()); // Add to index of eMail addresses (before adding to _tokens vector, _tokens.size() is the position)
1252 _tokens.push_back({ .type = 'e',
1253 .offset = _tokens[last_local_part].offset,
1254 .len = __email_len,// - token_begin,
1255 .flag_utf8 = _tokens[last_local_part].flag_utf8 || flag_utf8,
1256 .flag_angle = _tokens[last_local_part].flag_angle,
1257 .flag_null_addr = __email_len == 0,
1258 .p_token = _tokens[last_local_part].p_token + ((last_domain_part == -1 || _tokens[last_domain_part].p_token.empty()) ? u8"" : u8"@" + _tokens[last_domain_part].p_token),
1259 .index_display_name = last_display_name,
1260 .index_local_part = last_local_part,
1261 .index_domain_part = last_domain_part, });
1262 last_display_name = -1;
1263 last_local_part = -1;
1264 last_domain_part = -1;
1267 // --------------------------------------------------------------------------
1268 // Reset and prepare internal variables for the next token.
1269 // --------------------------------------------------------------------------
1270 RESET_FOR_NEXT_TOKEN;
1275 // --------------------------------------------------------------------------
1276 // Opening comment parenthesis.
1277 // --------------------------------------------------------------------------
1283 // --------------------------------------------------------------------------
1284 // Closing comment parenthesis.
1285 // --------------------------------------------------------------------------
1287 if (--comment_depth < 0) _exception("unbalanced closing comment parenthesis", offset);
1291 // --------------------------------------------------------------------------
1292 // Backslash (quote-literal).
1293 // --------------------------------------------------------------------------
1296 // --------------------------------------------------------------------------
1297 // Prevent a potential out-of-bounds buffer-overrun problem.
1298 // --------------------------------------------------------------------------
1299 if (++offset == len) {
1300 _exception("unbalanced quote-literal (backslash)", offset);
1301 continue; // Do this in case we're not throwing exceptions
1302 } // -x- if offset -x-
1304 // --------------------------------------------------------------------------
1305 // Update to next character (whatever it is, we're taking it literally).
// As with the main fetch, it is read from the saved copy (_addr).
1306 // --------------------------------------------------------------------------
1307 ch = _addr[offset];
1308 goto main_parsing_loop_default; // Fall-through to default
1312 // --------------------------------------------------------------------------
1313 // All remaining characters.
1314 // --------------------------------------------------------------------------
1316 //if (flag_angle) _exception("additional data not permitted", offset);
1317 main_parsing_loop_default:
1318 if (ch > 127) { // Include all UTF-8 character (unless prevented by the exception)
1320 if (!_policy_support_utf8) _exception("UTF-8 byte encountered", offset);
1321 p_token.push_back(ch);
1322 p_token_sp.push_back(ch);
// NOTE(review): the second operand below is the non-zero literal ' ', so this
// condition is always true and every remaining byte (0..127) is accepted no
// matter what CTEXT() reports.  It is left untouched because narrowing it to
// (ch == ' ') would start dropping characters that reach this point through
// the backslash (quote-literal) path; decide deliberately before tightening.
1323 } else if (CTEXT(ch) || ' ') { // Include almost everything for now (including spaces)
1324 if (ch != ' ') p_token.push_back(ch); // Exclude spaces
1325 if (!(ch == ' ' && p_token.size() == 0)) p_token_sp.push_back(ch); // Keep spaces
1328 } // -x- switch ch -x-
1330 } while (++offset < len); // -x- do while -x-
1332 // --------------------------------------------------------------------------
1333 // If the final token isn't empty (a.k.a., unfinished / not sealed), then
1334 // figure out what to do and run one more time, or else throw an exception.
1335 // --------------------------------------------------------------------------
1336 if (offset == len && token_begin < offset) {
1337 ch = ','; // Force comma (",") on parsing loop
1338 goto main_parsing_switch;
1339 } else if (offset > len && token_begin < offset) {
1340 _exception("incomplete data", offset - 1);
1341 } // -x- if offset -x-
1344 } // -x- rmailaddr& set -x-
1346 /*======================================================================*//**
1348 Find out how many eMail addresses this object holds.
1353 @returns The number of eMail addresses
1354 *///=========================================================================
// _index_e receives one entry for every type "e" (eMail address) token that
// set() creates, so its element count is the number of addresses held.
1356 return _index_e.size();
1357 } // -x- int size -x-
1359 /*======================================================================*//**
1361 Generate a detailed output of all tokens that's useful for debugging.
1365 g = group name (beginning; includes colon)
1366 ; = group termination (semi-colon character)
1368 e = eMail address (includes angle brackets, if present)
1371 c = comment (not implemented)
1372 \0 = not initialized (null; regard as "unknown"; this should never happen)
1375 The difference between "token" and "p_token" is that "token" is the original
1376 and [mostly] unprocessed atom, while "p_token" has been processed with any
1377 sets of angle-brackets, sets of quotation-marks, comments, and whitespace
1378 removed. In nearly all instances, the value of "p_token" is what's needed.
1379 @returns std::string containing multi-line text (one token per line)
1380 *///=========================================================================
1381 std::string tokens_to_string(
1382 /// Filter (string containing characters for those types that are to be
1383 /// included {unrecognized types will be ignored}; the default is no filter)
1384 const std::string& filter = "",
1385 /// Prefix (text to insert before the beginning of each line)
1386 const std::string& prefix = "",
1387 /// End-of-Line sequence (default is "\n")
1388 const std::string& eol = "\n") {
1390 // --------------------------------------------------------------------------
1391 // Internal variables.
1392 // --------------------------------------------------------------------------
1395 // --------------------------------------------------------------------------
1396 // Loop that builds list of tokens (one per line).
// NOTE(review): "i" is a signed int compared against the unsigned value
// returned by _tokens.size().
1397 // --------------------------------------------------------------------------
1398 for (int i = 0; i < _tokens.size(); i++) {
1400 // --------------------------------------------------------------------------
// Apply the type filter: an empty filter matches every token, otherwise the
// token's type character must occur somewhere in the filter string.
1402 // --------------------------------------------------------------------------
1403 if (filter.empty() || filter.find(_tokens[i].type) != std::string::npos) {
1405 // --------------------------------------------------------------------------
1406 // Shared characteristics.
//
// "token" is cut from the saved address (_addr) using the token's offset and
// length, while "p_token" is the processed text stored with the token.
// NOTE(review): both are converted through a C-string pointer, so each one
// stops at the first NUL byte; substr() throws std::out_of_range if the
// offset lies beyond the end of that (possibly shortened) string.
1407 // --------------------------------------------------------------------------
1408 t.append(prefix + "index=" + std::to_string(i)
1409 + " type=" + _tokens[i].type
1410 + " utf8=" + (_tokens[i].flag_utf8 ? "y" : "n")
1411 + " punycode=" + (_tokens[i].flag_punycode ? "y" : "n")
1412 + " obsolete=" + (_tokens[i].flag_obsolete ? "y" : "n")
1413 + " offset=" + std::to_string(_tokens[i].offset)
1414 + " length=" + std::to_string(_tokens[i].len)
1415 + " token=" + std::string((char*)_addr.c_str()).substr(_tokens[i].offset, _tokens[i].len)
1416 + " p_token=" + (char*)_tokens[i].p_token.c_str());
1418 // --------------------------------------------------------------------------
1419 // Type-specific characteristics (extra "name=y/n" fields appended to the
// same line, depending on the token's type).
1420 // --------------------------------------------------------------------------
1421 switch (_tokens[i].type) {
1423 t.append(std::string( " fqdn=") + (_tokens[i].flag_fqdn ? "y" : "n"));
1426 t.append(std::string( " angle=") + (_tokens[i].flag_angle ? "y" : "n"));
1427 // Fall-through to type "l"
1429 t.append(std::string(" null_addr=") + (_tokens[i].flag_null_addr ? "y" : "n"));
1431 } // -x- switch type -x-
1433 // --------------------------------------------------------------------------
1434 // Final EoL (End of Line) sequence.
1435 // --------------------------------------------------------------------------
1438 } // -x- if filter -x-
1443 } // -x- std::string tokens_to_string -x-
1445 /*======================================================================*//**
1447 Array-style access to eMail addresses. The first element is at index 0.
1451 @returns std::u8string with only the eMail address (no angle brackets, etc.)
1452 as a native UTF-8 string
1453 *///=========================================================================
1454 std::u8string operator[](
1455 /// Index of eMail address to query for (0 = first element; negative index
1456 /// values are calculated in reverse, starting with -1 as the final position)
// A negative index is rebased by adding the number of addresses
// (_index_e.size()); the resulting slot of _index_e gives the position of the
// type "e" token in _tokens, whose processed text (p_token) is returned as a
// copy.
// NOTE(review): nothing in this expression validates the index, so a value
// outside [-size(), size()) is not rejected here.
1458 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token;
1459 } // -x- std::u8string operator[] -x-
1461 /*======================================================================*//**
1463 Support convenient streaming usage with std::cout, std::cerr, and friends.
1464 @returns eMail address in human-readable form
1465 *///=========================================================================
1466 friend std::ostream& operator<< (
1467 /// Output stream (provided automatically by std::cout and std::cerr)
1469 /// Object class (matched by compiler)
1470 rmailaddr const& c) {
// Streams the saved copy of the address (_addr) rather than any processed
// token.  It is passed as a C-string pointer, so output ends at the first
// NUL byte.
1471 return o << (char*)c._addr.c_str();
1472 } // -x- std::ostream& operator<< -x-
1474 }; // -x- class rmailaddr -x-
1476}; // -x- namespace randolf -x-