6#include <stdexcept> // std::invalid_argument
9#include <arpa/inet.h> // Used only for checking for valid IP addresses in domain literals (inet_pton)
13 // --------------------------------------------------------------------------
14 // Constants that list sets of valid characters, which are optimized to test
15 // for ranges of the most commonly-used characters first during parsing, are
16 // named consistently with their respective rule names as defined in RFC2822.
18 // CRLF and \ are invisible in the quoted string according to RFC2822 section
21 // RFC2822 section 3.2.5 also defines a "quoted-string" as containing the
22 // following valid characters (spaces are also permitted): 33, 35-91, 93-126
23 // Certain characters must be quoted first though, and every character
24 // following a backslash is taken literally (and the backslash is removed
27 // RFC2822 section 3.2.4 defines an "atom" as containing the following valid
28 // characters: 0123456789
29 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
30 // abcdefghijklmnopqrstuvwxyz
31 // !#$%&'*+-/=?^_`{|}~
33 // Quote characters and quotation marks are not permitted in the domain part.
35 // According to RFC2822 section 3.2.5, a phrase (DisplayName / Comments) can
36 // be either an atom (ATEXT) or quoted-text (QTEXT).
38 // According to RFC2822 section 2.2.2, whitespace characters are tabs (ASCII
39 // character 9) and spaces (ASCII character 32).
41 // RFC2822 section 3.4 last paragraph indicates that a group construct is
42 // optional, and preceded by a colon following any number of comma-delimited
43 // recipients (including zero or one). Group constructs must end with a
45 // --------------------------------------------------------------------------
47 // --------------------------------------------------------------------------
48 // The following macros are optimized for performance by testing for the most
49 // commonly-used characters first.
52 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
53 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
54 // 47...57 /0123456789
59 // --------------------------------------------------------------------------
// ATEXT(a): true when character code "a" is valid atom text (RFC2822 section
// 3.2.4).  Every use of the argument is parenthesized so that an expression
// argument (e.g., a conditional expression) is tested as a single value.  The
// argument is evaluated more than once, so it must be free of side effects.
#define ATEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 65 && (a) <= 90) \
               || ((a) >= 47 && (a) <= 57) \
               || (a) == 45 || (a) == 33 \
               || ((a) >= 35 && (a) <= 39) \
               || (a) == 42 || (a) == 43 \
               || (a) == 61 || (a) == 63 )
68 // --------------------------------------------------------------------------
69 // ATEXT_OBS ("obsolete standard" is ATEXT plus periods, spaces, and tabs)
70 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
71 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
72 // 45...57 -./0123456789
73 // 32 | 33 {space:32}!
78 // --------------------------------------------------------------------------
79 #define ATEXT_OBS(a) ((a >= 94 && a <= 126) \
80 || (a >= 65 && a <= 90) \
81 || (a >= 45 && a <= 57) \
82 || a == 32 || a == 33 \
83 || (a >= 35 && a <= 39) \
84 || a == 42 || a == 43 \
85 || a == 61 || a == 63 \
88 // --------------------------------------------------------------------------
89 // CTEXT (comment text)
90 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
91 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
93 // --------------------------------------------------------------------------
// CTEXT(a): true when character code "a" is valid comment text (33...39,
// 42...91, or 93...126).  Every use of the argument is parenthesized so that
// an expression argument is tested as a single value.  The argument is
// evaluated more than once, so it must be free of side effects.
#define CTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 42 && (a) <= 91) \
               || ((a) >= 33 && (a) <= 39) )
98 // --------------------------------------------------------------------------
99 // CTEXT_WSP (comment text with white space)
100 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
101 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
102 // 32...39 {space:32}!"#$%&'
104 // --------------------------------------------------------------------------
105 #define CTEXT_WSP(a) ((a >= 93 && a <= 126) \
106 || (a >= 42 && a <= 91) \
107 || (a >= 32 && a <= 39) \
110 // --------------------------------------------------------------------------
111 // CTEXT_OBS (obsolete comment text)
115 // --------------------------------------------------------------------------
116 #define CTEXT_OBS(a) (CTEXT(a) \
120 // --------------------------------------------------------------------------
121 // DTEXT (domain-part, not including characters needed for domain-literals)
122 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
123 // 33...90 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ
124 // --------------------------------------------------------------------------
// DTEXT(a): true when character code "a" is valid domain-part text (33...90
// or 94...126; the characters needed for domain-literals are excluded).
// Every use of the argument is parenthesized so that an expression argument
// is tested as a single value.  The argument is evaluated more than once, so
// it must be free of side effects.
#define DTEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 33 && (a) <= 90) )
128 // --------------------------------------------------------------------------
129 // FWS (folding white space)
133 // --------------------------------------------------------------------------
134 #define FWS(a) (a == 10 \
138 // --------------------------------------------------------------------------
139 // QTEXT (quoted text)
140 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
141 // 35...91 #$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
144 // --------------------------------------------------------------------------
// QTEXT(a): true when character code "a" is valid quoted text (32, 33,
// 35...91, or 93...126; the quotation mark and the backslash are excluded).
// Every use of the argument is parenthesized so that an expression argument
// is tested as a single value.  The argument is evaluated more than once, so
// it must be free of side effects.
#define QTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 35 && (a) <= 91) \
               || (a) == 32 || (a) == 33 )
149 // --------------------------------------------------------------------------
151 // 14...127 {char:14-31}{space:32}!"#$%&'()*+,-./0123456789:;<=>
152 // ?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^
153 // _`abcdefghijklmnopqrstuvwxyz{|}~
155 // 1...9 {char:1-6}{beep:7}{backspace:8}{tab:9}
156 // 11 | 12 {char:11}{char:12}
157 // --------------------------------------------------------------------------
// TEXT(a): true when character code "a" is 1...9, 11, 12, or 14...127 (that
// is, any code up to 127 other than 0, 10, and 13).  Every use of the
// argument is parenthesized so that an expression argument is tested as a
// single value.  The argument is evaluated more than once, so it must be free
// of side effects.
#define TEXT(a) (((a) >= 14 && (a) <= 127) \
              || ((a) >= 1 && (a) <= 9) \
              || (a) == 11 || (a) == 12 )
162 // --------------------------------------------------------------------------
166 // --------------------------------------------------------------------------
167 #define WSP(a) (a == 32 \
170 // --------------------------------------------------------------------------
171 // Used by the set() method to consistently reset internal variables when
172 // moving onward to the next token.
174 // token_begin: Configures beginning of next token.
175 // --------------------------------------------------------------------------
176 #define RESET_FOR_NEXT_TOKEN \
178 flag_quote = false; \
179 token_begin = offset + 1; \
183 /*======================================================================*//**
185 This @ref rmailaddr class provides an object-oriented eMail address.
189 Some of the key features are:
191 - constructors with sensible defaults help to simplify coding
192 - documentation includes code samples (with @c \#include lines as needed)
193 - can handle ASCIIZ without needing to specify string length
194 - can handle @c std::string (which tracks its own string length)
198 Validation of the format of an eMail address is helpful in ensuring that
199 eMail addresses received from elsewhere comply with internet standards.
203 I created this class to make it easier to write internet server daemons and
204 other software that needs to accept and/or handle eMail addresses. (This is
205 a complete re-write of the version I wrote in Java 17 years ago in 2007,
206 which includes a significant array of differences due to the improved parsing
207 approaches I use now that are more efficient, and the need to make sure that
208 UTF-8 characters and punycode are both handled in a transparent manner.)
212 @author Randolf Richardson
215 2024-May-07 v1.00 Initial version
218 Lower-case letter "m" is regularly used in partial example code to represent
219 an instantiated rmailaddr object.
221 An ASCIIZ string is a C-string (char* array) that includes a terminating null
222 (0) character at the end.
226 I use the term "ASCIIZ string" to indicate an array of characters that's
227 terminated by a 0 (a.k.a., null). Although this is very much the same as a
228 C-string, the difference is that in many API functions a C-string must often
229 be accompanied by its length value. When referring to an ASCIIZ string, I'm
230 intentionally indicating that the length of the string is not needed because
231 the string is null-terminated. (This term was also commonly used in assembly
232 language programming in the 1970s, 1980s, and 1990s, and as far as I know is
233 still used by machine language programmers today.)
238 #include <iostream> // std::cout, std::cerr, std::endl, etc.
239 #include <stdexcept> // std::invalid_argument exception
241 #include <randolf/rmailaddr>
243 int main(int argc, char *argv[]) {
245 randolf::rmailaddr m("nobody@example.com");
246 } catch (const std::invalid_argument& e) {
247 std::cerr << "eMail address format exception: " << e.what() << std::endl;
249 } catch (const std::exception& e) {
250 std::cerr << "Other exception: " << e.what() << std::endl;
254 } // -x- int main -x-
257 Parameter stacking is supported (with methods that return @c rmailaddr*); in
258 this example, notice that semicolons (";") and "e." references are omitted
259 (when compared with the above):
262 #include <iostream> // std::cout, std::cerr, std::endl, etc.
263 #include <stdexcept> // std::invalid_argument exception
265 #include <randolf/rmailaddr>
267 int main(int argc, char *argv[]) {
269 randolf::rmailaddr m("nobody@example.com");
270 } catch (const std::invalid_argument& e) {
271 std::cerr << "eMail address format exception: " << e.what() << std::endl;
273 } catch (const std::exception& e) {
274 std::cerr << "Other exception: " << e.what() << std::endl;
278 } // -x- int main -x-
280 *///=========================================================================
284 /*======================================================================*//**
286 Structure of errors (only used when exceptions are disabled).
288 @see policy_throw_exceptions
289 *///=========================================================================
293 /// Offset (0 = position of first byte)
295 }; // -x- struct error_data -x-
298 /*======================================================================*//**
300 Structure of positions within the original eMail string where a portion
301 begins, and its length (in bytes), along with various other information about
304 This is used internally, and std::vector<mail_addr_token> organizes them and
305 looks after freeing memory.
306 *///=========================================================================
307 struct mail_addr_token {
309 /// g = group name (beginning; includes colon)
310 /// ; = group termination (semi-colon character)
312 /// e = eMail address (includes angle brackets, if present)
316 /// \0 = not initialized (null can effectively be regarded as meaning "unknown")
318 /// Offset, within the string, where this part begins
319 unsigned int offset = 0;
320 /// Total number of bytes
321 unsigned int len = 0;
322 /// Whether any UTF-8 characters are present in this part
323 bool flag_utf8 = false;
324 /// Whether this part is in punycode (begins with "xn--")
325 bool flag_punycode = false; // TODO
326 /// Whether this part is "obsolete" (according to RFCs)
327 bool flag_obsolete = false; // TODO
328 /// Whether eMail address was enclosed in angle brackets (type "e" only)
329 bool flag_angle = false;
330 /// Whether the token was enclosed in quotation marks
331 bool flag_quotes = false;
332 /// Whether eMail address is a null address enclosed in angle brackets (type "e" only)
333 bool flag_null_addr = false;
334 /// Whether the domain-part is an FQDN (type "d" only)
335 bool flag_fqdn = false; // TODO
336 /// Whether the domain-part is a domain-literal (type "d" only)
337 bool flag_domain_literal = false; // TODO
338 /// Depth of groups (types "g" and ";" only)
339 unsigned short depth = 0;
340 /// Processed data, with quotation marks, angle brackets, comments, whitespace, etc., removed
341 std::u8string p_token;
342 /// Index to display-name (type "e" only)
343 int index_display_name = -1;
344 /// Index to local-part (type "e" only)
345 int index_local_part = -1;
346 /// Index to domain-part (type "e" only)
347 int index_domain_part = -1;
348 }; // -x- struct mail_addr_token -x-
350 // --------------------------------------------------------------------------
351 // Internal variables.
352 // --------------------------------------------------------------------------
353 std::u8string _addr; // Original eMail address
354 std::vector<mail_addr_token> _tokens; // All eMail address tokens
355 std::vector<int> _index_e; // Index of type "e" records in _tokens
356 std::vector<error_data> _errors; // Error tracking
357 short group_depth = 0; // Recursive group tracking
358 bool angle_bracket_mode = false; // Angle-bracket mode tracking
359 bool quote_mode = false; // Quotation-marks mode tracking
361 // --------------------------------------------------------------------------
363 // --------------------------------------------------------------------------
364 bool _policy_keep_comments = false; // Whether to retain comments embedded in eMail addresses
365 bool _policy_throw_exceptions = true; // TRUE = throw exceptions; FALSE = save errors internally (in _errors)
366 bool _policy_tabs_to_spaces = false; // Whether to convert every tab into a space
367 bool _policy_support_utf8 = true; // Whether to support UTF-8 (FALSE = 7bit characters only)
369 /*======================================================================*//**
371 *///=========================================================================
375 /// Offset (0 = position of first byte)
377 if (_policy_throw_exceptions) throw std::invalid_argument(message + " at offset " + std::to_string(offset));
378 _errors.push_back({ message, offset });
380 } // -x- void _exception -x-
383 /*======================================================================*//**
385 Instantiate an empty rmailaddr that doesn't qualify as a properly-formatted
386 internet eMail address (because the minimum length of a valid internet eMail
387 address is 1 character).
389 Instantiating an empty rmailaddr is particularly useful for header-file
390 definitions; for example:
392 #include <iostream> // std::cout, std::cerr, std::endl, etc.
393 #include <stdexcept> // std::invalid_argument exception
395 #include <randolf/rmailaddr>
397 randolf::rmailaddr m; // <-- Empty rmailaddr initialization (no exceptions)
399 int main(int argc, char *argv[]) {
401 m.set("nobody@example.com");
402 } catch (const std::invalid_argument& e) {
403 std::cerr << "eMail address format exception: " << e.what() << std::endl;
405 } catch (const std::exception& e) {
406 std::cerr << "Other exception: " << e.what() << std::endl;
410 } // -x- int main -x-
412 *///=========================================================================
413 rmailaddr() noexcept {}; // -x- constructor rmailaddr -x-
415 /*======================================================================*//**
417 Instantiate an rmailaddr that qualifies as a properly-formatted internet
418 eMail address (if it doesn't qualify, then an exception will be thrown).
422 #include <iostream> // std::cout, std::cerr, std::endl, etc.
423 #include <stdexcept> // std::invalid_argument exception
425 #include <randolf/rmailaddr>
427 int main(int argc, char *argv[]) {
429 randolf::rmailaddr m("nobody@example.com");
430 } catch (const std::invalid_argument& e) {
431 std::cerr << "eMail address format exception: " << e.what() << std::endl;
433 } catch (const std::exception& e) {
434 std::cerr << "Other exception: " << e.what() << std::endl;
438 } // -x- int main -x-
440 @throws std::invalid_argument describing the problem, along with the byte
441 offset where the problem originated from
443 *///=========================================================================
445 /// RFC-compliant eMail address
446 const char8_t* mailbox,
447 /// Number of characters (-1 = ASCIIZ string)
448 int len = -1) { set(mailbox, len); }; // -x- constructor rmailaddr -x-
450 /*======================================================================*//**
451 @copydoc rmailaddr(const char8_t*, int)
453 *///=========================================================================
455 /// RFC-compliant eMail address
457 /// Number of characters (-1 = ASCIIZ string)
458 int len = -1) { set((char8_t*)mailbox, len); }; // -x- constructor rmailaddr -x-
460 /*======================================================================*//**
461 @copydoc rmailaddr(const char8_t*, int)
463 *///=========================================================================
465 /// RFC-compliant eMail address
466 const std::string mailbox) { set((char8_t*)mailbox.data(), mailbox.size()); }; // -x- constructor rmailaddr -x-
468 /*======================================================================*//**
469 @copydoc rmailaddr(const char8_t*, int)
471 *///=========================================================================
473 /// RFC-compliant eMail address
474 const std::u8string mailbox) { set(mailbox.data(), mailbox.size()); }; // -x- constructor rmailaddr -x-
476 /*======================================================================*//**
478 Access only the eMail address, without display-name, and without any sets of
479 enclosing quotation-marks or enclosing angle-brackets, etc.
485 @throws std::out_of_range if the index is out-of-range
486 @returns std::string with only the eMail address (no display-name, and no
487 enclosing sets of quotation-marks or enclosing angle-brackets, etc.)
488 *///=========================================================================
490 /// Index of eMail address to query for (0 = first element; negative index
491 /// values are calculated in reverse, starting with -1 as the final position)
493// return std::string((char*)_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token.c_str());
494 return std::string((char*)_tokens.at(_index_e.at(index >= 0 ? index : _index_e.size() + index)).p_token.c_str());
495 }; // -x- std::string addr -x-
497 /*======================================================================*//**
499 Access an eMail address's display-name (the portion preceding the angle
500 brackets). If there were no angle-brackets, then an empty string will
507 @returns std::string with only the display-name (no quotation marks, etc.)
508 *///=========================================================================
509 std::string display_name(
510 /// Index of eMail address to query for (0 = first element; negative index
511 /// values are calculated in reverse, starting with -1 as the final position)
513 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name].p_token.c_str());
514 }; // -x- std::string display_name -x-
516 /*======================================================================*//**
518 Access an eMail address's domain-part (the portion following the @c @ sign).
525 @returns std::string with only the domain-part (no angle brackets, etc.)
526 *///=========================================================================
527 std::string domain_part(
528 /// Index of eMail address to query for (0 = first element; negative index
529 /// values are calculated in reverse, starting with -1 as the final position)
531 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_domain_part].p_token.c_str());
532 }; // -x- std::string domain_part -x-
534 /*======================================================================*//**
536 Access an eMail address (enclosed in angle-brackets), and preceded by the
537 display-name (if one is available).
540 If the original form of the display-name had a delimiting space before the
541 eMail address, then that space will be present in the result here. If not, a
542 space will not be inserted. (In other words, this aspect of the original
543 full eMail address will be retained.)
549 @returns std::string with display-name and eMail address (in angle-brackets)
550 *///=========================================================================
552 /// Index of eMail address to query for (0 = first element; negative index
553 /// values are calculated in reverse, starting with -1 as the final position)
556 // --------------------------------------------------------------------------
557 // The eMail address has no display-name because it wasn't enclosed in angle
558 // brackets, so present the eMail address on its own, in angle brackets.
559 // --------------------------------------------------------------------------
560 mail_addr_token e = _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]];
561 if (e.index_display_name < 0)
562 return "<" + std::string((char*)e.p_token.c_str()) + ">";
564 // --------------------------------------------------------------------------
565 // There was a display-name, so return the eMail address with display-name
566 // (enclosed in quotation marks if it started out that way).
567 // --------------------------------------------------------------------------
568 mail_addr_token n = _tokens[e.index_display_name];
571 + std::string((char*)n.p_token.c_str())
574 + std::string((char*)e.p_token.c_str())
577 return std::string((char*)n.p_token.c_str())
579 + std::string((char*)e.p_token.c_str())
581 }; // -x- std::string email -x-
583 /*======================================================================*//**
585 Find out if this object doesn't hold any eMail addresses.
590 @returns TRUE = no eMail addresses@n
591 FALSE = one or more eMail addresses
592 *///=========================================================================
593 bool empty() { return _index_e.empty(); }; // -x- bool empty -x-
595 /*======================================================================*//**
597 Return a list of errors that have been collected (instead of throwing
600 @see policy_throw_exceptions
601 *///=========================================================================
602 std::vector<error_data> errors() { return _errors; }; // -x- std::vector<error-data> errors -x-
604 /*======================================================================*//**
606 Clear the list of errors that have been collected (instead of throwing
609 @see policy_throw_exceptions
610 @returns The same rmailaddr object so as to facilitate stacking
611 *///=========================================================================
612 rmailaddr* errors_clear() { _errors.clear(); return this; }; // -x- rmailaddr* errors_clear -x-
614 /*======================================================================*//**
616 Grade an eMail address, similar to traditional elementary school grades. For
617 simplicity, grades "a" through "c" are passes, while grades "d" through "f"
618 are failures, although if less strict then "d" should also qualify as a pass.
622 a = Angle-brackets surrounding eMail address (optional display-name)
623 b = Bare eMail address (no display-name)
624 c = Complex eMail address (groups; optional angle-brackets; optional display-name)
625 d = Defective (because obsolete RFC standards were utilized)
626 e = Errors (only when collecting errors instead of throwing exceptions)
627 f = Failure (an exception was thrown, or eMail address is blank)
630 To test for a pass, use a comparison such as <tt>m.grade() <= 'c'</tt>
631 (strict) or <tt>m.grade() <= 'd'</tt> (not strict).
632 *///=========================================================================
634 /// eMail address index (default is 0 for the first eMail address)
638 }; // -x- char grade -x-
640 /*======================================================================*//**
642 Indicates whether a display-name was included with this eMail address.
643 @returns TRUE = eMail address includes a display-name@n
644 FALSE = eMail address has no display-name
645 *///=========================================================================
646 bool has_display_name(
647 /// eMail address index (default is 0 for the first eMail address)
650 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
651 }; // -x- bool has_display-name -x-
653 /*======================================================================*//**
655 Find out whether this object holds any number of eMail addresses. If there
656 are no eMail addresses, then this method returns @c FALSE.
661 @returns TRUE = one or more eMail addresses@n
662 FALSE = no eMail addresses
663 *///=========================================================================
664 bool has_any() { return _index_e.size() > 0; }; // -x- bool has_any -x-
666 /*======================================================================*//**
668 Find out whether this object holds multiple eMail addresses. If there is
669 only one eMail address, or no eMail addresses at all, then this method
675 @returns TRUE = two or more eMail addresses@n
676 FALSE = one eMail address@n
677 FALSE = no eMail addresses
678 *///=========================================================================
679 bool has_multiple() { return _index_e.size() > 1; }; // -x- bool has_multiple -x-
681 /*======================================================================*//**
683 Find out whether this object holds exactly one eMail address. If there are
684 two or more eMail addresses, or no eMail addresses, then this method returns
690 @returns TRUE = exactly one eMail address@n
691 FALSE = two or more eMail addresses@n
692 FALSE = no eMail addresses
693 *///=========================================================================
694 bool has_one() { return _index_e.size() == 1; }; // -x- bool has_one -x-
696 /*======================================================================*//**
698 Find out the state of this policy.
699 @see policy_keep_comments
700 @returns policy status
701 *///=========================================================================
702 bool is_policy_keep_comments() { return _policy_keep_comments; }; // -x- bool is_policy_keep_comments -x-
704 /*======================================================================*//**
706 Find out the state of this policy.
707 @see policy_tabs_to_spaces
708 @returns policy status
709 *///=========================================================================
710 bool is_policy_tabs_to_spaces() { return _policy_tabs_to_spaces; }; // -x- bool is_policy_tabs_to_spaces -x-
712 /*======================================================================*//**
714 Find out the state of this policy.
715 @see policy_throw_exceptions
716 @returns policy status
717 *///=========================================================================
718 bool is_policy_throw_exceptions() { return _policy_throw_exceptions; }; // -x- bool is_policy_throw_exceptions -x-
720 /*======================================================================*//**
722 Find out the state of this policy.
723 @see policy_support_utf8
724 @returns policy status
725 *///=========================================================================
726 bool is_policy_support_utf8() { return _policy_support_utf8; }; // -x- bool is_policy_support_utf8 -x-
728 /*======================================================================*//**
730 Indicates whether this is just an eMail address, without any other parts such
731 as display-name, group constructs, comments, etc.
732 @returns TRUE = eMail address includes a display-name@n
733 FALSE = eMail address has no display-name
734 *///=========================================================================
736 /// indicate whether angle-brackets are okay (default is FALSE so that the
737 /// meaning of the word "pure" is not tainted)
738 const bool angle_flag = false,
739 /// eMail address index (default is 0 for the first eMail address)
742//TODO: Finish this (we need to consider groups, display-name, comments, etc.)
743 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
744 }; // -x- bool is_pure -x-
746 /*======================================================================*//**
748 Access an eMail address's local-part (the portion preceding the @c @ sign).
754 @returns std::string with only the local-part (no angle brackets, etc.)
755 *///=========================================================================
756 std::string local_part(
757 /// Index of eMail address to query for (0 = first element; negative index
758 /// values are calculated in reverse, starting with -1 as the final position)
760 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_local_part].p_token.c_str());
761 }; // -x- std::string local_part -x-
763 /*======================================================================*//**
765 Sets the policy for whether to keep comments that were embedded in eMail
766 address group-construct, display-name, and local-part portions.
768 Comments are excluded by default because most systems don't need them, but in
769 the event that they are needed (or wanted), this policy makes it possible to
770 make sure they aren't excluded during processing.
772 @see is_policy_keep_comments
773 @returns The same rmailaddr object so as to facilitate stacking
774 *///=========================================================================
775 rmailaddr* policy_keep_comments(
776 /// FALSE = do not retain comments embedded in eMail addresses (default)@n
777 /// TRUE = retain comments embedded in eMail addresses
780 // --------------------------------------------------------------------------
781 // Update internal policy.
782 // --------------------------------------------------------------------------
783 _policy_keep_comments = policy_flag;
785 // --------------------------------------------------------------------------
786 // Return this object to facilitate stacking.
787 // --------------------------------------------------------------------------
790 }; // -x- rmailaddr* policy_keep_comments -x-
792 /*======================================================================*//**
794 Sets the policy for whether to support UTF-8 characters.
796 Some older systems may not be able to handle 8-bit data that UTF-8 utilizes,
797 in which case this policy makes it possible to easily reject incompatible
798 eMail addresses before attempting to use them with such systems.
800 @see is_policy_support_utf8
801 @returns The same rmailaddr object so as to facilitate stacking
802 *///=========================================================================
803 rmailaddr* policy_support_utf8(
804 /// TRUE = support UTF-8 characters in eMail addresses (default)@n
805 /// FALSE = do not support UTF-8 characters in eMail addresses
808 // --------------------------------------------------------------------------
809 // Update internal policy.
810 // --------------------------------------------------------------------------
811 _policy_support_utf8 = policy_flag;
813 // --------------------------------------------------------------------------
814 // Return this object to facilitate stacking.
815 // --------------------------------------------------------------------------
818 }; // -x- rmailaddr* policy_support_utf8 -x-
820 /*======================================================================*//**
822 Sets the policy for whether to convert every tab character (ASCII character 9)
823 to a space (ASCII character 32). This conversion occurs only once when the
824 eMail address is initially specified in a constructor or by way of one of the
825 @ref set() methods (changing this policy after this point will not be applied
826 to the current eMail address, but it will be in effect for future calls to
827 any of the @ref set() methods).
829 There are some situations where a tab character can create problems, such as
830 when interacting with certain older software or software that makes incorrect
831 assumptions about how to parse an eMail address, and this policy makes it
832 easy to accommodate such situations for the tab character, which some users
833 may be including by using the tab key on their keyboards.
835 @see is_policy_tabs_to_spaces
836 @returns The same rmailaddr object so as to facilitate stacking
837 *///=========================================================================
838 rmailaddr* policy_tabs_to_spaces(
839 /// TRUE = convert every tab character to a space@n
840 /// FALSE = do not convert tab characters to spaces (default)
843 // --------------------------------------------------------------------------
844 // Update internal policy (the flag is stored in _policy_tabs_to_spaces and
// is consulted by set() when it saves its copy of the address).
845 // --------------------------------------------------------------------------
846 _policy_tabs_to_spaces = policy_flag;
848 // --------------------------------------------------------------------------
849 // Return this object to facilitate stacking (chained policy calls).
850 // --------------------------------------------------------------------------
853 }; // -x- rmailaddr* policy_tabs_to_spaces -x-
855 /*======================================================================*//**
857 Sets the policy for whether to throw exceptions when an error is encountered.
859 When this flag is cleared (FALSE), errors are tracked internally instead of throwing any
860 exceptions, and will need to be retrieved using the @ref errors() method,
861 which is useful for analyzing an eMail address. (Enabling or disabling this
862 flag does not erase the errors that are stored internally; you will need to
863 use the @ref errors_clear method for this.)
866 This policy is not meant for general use in the majority of applications; it
867 is intended for technical analysis, which would be useful in diagnostic and
868 research applications, or packet analysis applications like WireShark, or for
869 advanced users who are interested in more technical detail.
872 @see is_policy_throw_exceptions
873 @returns The same rmailaddr object so as to facilitate stacking
874 *///=========================================================================
875 rmailaddr* policy_throw_exceptions(
876 /// TRUE = throw exceptions (default)@n
877 /// FALSE = don't throw exceptions
880 // --------------------------------------------------------------------------
881 // Update internal policy (the flag is stored in _policy_throw_exceptions).
882 // --------------------------------------------------------------------------
883 _policy_throw_exceptions = policy_flag;
885 // --------------------------------------------------------------------------
886 // Return this object to facilitate stacking (chained policy calls).
887 // --------------------------------------------------------------------------
890 }; // -x- rmailaddr* policy_throw_exceptions -x-
892 /*======================================================================*//**
894 Set a new eMail address, resetting all internal flags, counters, and arrays
895 (but not changing any existing policies). Any existing eMail addresses will
896 be cleared out. (This method is also used internally by most of this class's
898 @throws std::invalid_argument describing the problem, along with the byte
899 offset where the problem originated from
901 @returns The same rmailaddr object so as to facilitate stacking
902 *///=========================================================================
904 /// RFC-compliant eMail address (the bytes are reinterpreted as char8_t)
906 /// Number of characters (-1 = ASCIIZ string, measured by the char8_t* overload)
908 return set((char8_t*)mailbox, len); // Delegate to the char8_t* overload, which does the parsing
909 }; // -x- rmailaddr* set -x-
911 /*======================================================================*//**
912 @copydoc set(const char*, int)
914 @returns The same rmailaddr object so as to facilitate stacking
915 *///=========================================================================
917 /// RFC-compliant eMail address (passed by value; its bytes are reinterpreted as char8_t)
918 const std::string mailbox) {
919 return set((char8_t*)mailbox.data(), mailbox.size()); // Delegate to the char8_t* overload with an explicit length
920 }; // -x- rmailaddr* set -x-
922 /*======================================================================*//**
923 @copydoc set(const char*, int)
925 @returns The same rmailaddr object so as to facilitate stacking
926 *///=========================================================================
928 /// RFC-compliant eMail address (native UTF-8 string, passed by value)
929 const std::u8string mailbox) {
930 return set(mailbox.data(), mailbox.size()); // Delegate to the char8_t* overload with an explicit length
931 }; // -x- rmailaddr* set -x-
933 /*======================================================================*//**
934 @copydoc set(const char*, int)
936 @returns The same rmailaddr object so as to facilitate stacking
937 *///=========================================================================
939 /// RFC-compliant eMail address
940 const char8_t* mailbox,
941 /// Number of characters (-1 = ASCIIZ string)
944 // --------------------------------------------------------------------------
945 // Measure the mailbox string with strlen if an ASCIIZ string was indicated.
946 // --------------------------------------------------------------------------
947 if (len == -1) len = std::strlen((char*)mailbox);
949 // --------------------------------------------------------------------------
950 // Save a copy of the original eMail address.
951 // --------------------------------------------------------------------------
952 _addr.assign(mailbox, len); // We need to save this for later reference
954 // --------------------------------------------------------------------------
955 // Pre-adjustments (optional, as per policy flags).
956 // --------------------------------------------------------------------------
957 if (_policy_tabs_to_spaces) // Policy: Convert all tabs to spaces
958 _addr.replace(_addr.begin(), _addr.end(), '\t', ' '); // NOTE(review): with (iterator, iterator, char, char) arguments this appears to select basic_string::replace(first, last, count, ch), i.e. the whole address becomes 9 spaces; the std::replace() algorithm looks intended -- confirm
960 // --------------------------------------------------------------------------
961 // Internal variables.
962 // --------------------------------------------------------------------------
963 int offset = 0; // Offset within original mailbox char8_t[] array
964 int last_display_name = -1; // Used to build type "e" eMail tokens (index into _tokens, -1 = none yet)
965 int last_local_part = -1; // Used to build type "e" eMail tokens (index into _tokens, -1 = none yet)
966 int last_domain_part = -1; // Used to build type "e" eMail tokens (index into _tokens, -1 = none yet)
968 // --------------------------------------------------------------------------
969 // Internal variables that are reset or updated together at various times,
970 // such as when a token is [in most cases] completed.
971 // --------------------------------------------------------------------------
972 int token_begin = 0; // Beginning offset within current portion of string being parsed
973 char8_t ch; // Character being tested (this needs to be defined outside of the main loop)
974 bool flag_utf8 = false; // UTF8 character(s) detected
975 bool flag_angle = false; // Angle-bracket detected
976 bool flag_quote = false; // Quotation-marks mode detected
977 bool active_angle = false; // Angle-bracket mode is active
978 bool active_at_sign = false; // At-sign mode is active (domain-part instead of local-part interpretation)
979 bool active_quote = false; // Quotation-marks mode is active
980 int comment_depth = 0; // Comments are active when this value is greater than 0 (too many closed comments are in the negative)
981 std::u8string p_token; // Processed token data (angle brackets, quotation marks, comments, and whitespace omitted)
982 std::u8string p_token_sp; // Processed token data (angle brackets, quotation marks, and comments omitted), with spaces preserved
984 // --------------------------------------------------------------------------
985 // Main parsing loop that identifies tokens and ensures compliance, and also
986 // effectively pre-processes eMail addresses on-the-fly for faster access
987 // from the _emails vector later.
988 // --------------------------------------------------------------------------
991 // --------------------------------------------------------------------------
992 // Obtain next character. NOTE(review): the loop is a do-while, so this read
// happens once even when len is 0 -- confirm an empty buffer is safe here.
993 // --------------------------------------------------------------------------
994 ch = mailbox[offset]; // NOTE(review): reads the caller's buffer rather than _addr, so the tab-to-space pre-adjustment above is not seen by the parser -- confirm intent
996 // --------------------------------------------------------------------------
997 // Compare one character at a time, but first process special cases of quoted
998 // data (copy most of the data) and comments (ignore the data).
999 // --------------------------------------------------------------------------
1000 if (flag_quote && active_quote && ch != '"') {
1001 if (QTEXT(ch)) { // Include only quoted text
1002 p_token.push_back(ch);
1003 p_token_sp.push_back(ch);
1004 } // -x- if QTEXT -x-
1006 } else if (comment_depth > 0 && ch != ')') { // Inside a comment: data is dropped unless _policy_keep_comments is set
1007 if (_policy_keep_comments) {
1008 p_token.push_back(ch);
1009 p_token_sp.push_back(ch);
1010 } // -x- if _policy_keep_comments -x-
1013 main_parsing_switch: switch (ch) {
1015 // --------------------------------------------------------------------------
1016 // Quotation mark: opens quotation-marks mode if inactive, closes it if active.
1017 // --------------------------------------------------------------------------
1019 if (!active_quote) { // Enable quotation-marks mode
1020 if (flag_quote) _exception("quotation-marks mode can't be re-opened", offset);
1021 active_quote = true;
1023 } else { // Disable quotation-marks mode
1024 active_quote = false;
1029 // --------------------------------------------------------------------------
1030 // Group name ends with a colon.
1031 // --------------------------------------------------------------------------
1034 // --------------------------------------------------------------------------
1035 // Internal tracking.
1036 // --------------------------------------------------------------------------
1039 // --------------------------------------------------------------------------
1040 // Add this token to the tokens vector (type "g", spaces preserved).
1041 // --------------------------------------------------------------------------
1042 _tokens.push_back({ .type = 'g',
1043 .offset = token_begin,
1044 .len = offset - token_begin,
1045 .flag_utf8 = flag_utf8,
1046 .p_token = p_token_sp, });
1048 // --------------------------------------------------------------------------
1049 // Reset and prepare internal variables for the next token.
1050 // --------------------------------------------------------------------------
1051 RESET_FOR_NEXT_TOKEN;
1056 // --------------------------------------------------------------------------
1057 // Group of eMail addresses is terminated by a semi-colon.
1058 // --------------------------------------------------------------------------
1061 // --------------------------------------------------------------------------
1062 // Internal tracking (a terminator without an open group, or inside an open
// angle bracket, is reported as an error).
1063 // --------------------------------------------------------------------------
1064 if (--group_depth < 0) _exception("too many group construct terminators", offset);
1065 if (active_angle) _exception("unbalanced open angle bracket", offset);
1067 // --------------------------------------------------------------------------
1068 // Add this token terminator to the tokens vector.
1069 // --------------------------------------------------------------------------
1070 _tokens.push_back({ .type = ';',
1071 .offset = token_begin,
1072 .len = offset - token_begin,
1073 .flag_utf8 = flag_utf8,
1074 .p_token = p_token_sp, });
1076 // --------------------------------------------------------------------------
1077 // Reset and prepare internal variables for the next token.
1078 // --------------------------------------------------------------------------
1079 RESET_FOR_NEXT_TOKEN;
1084 // --------------------------------------------------------------------------
1085 // Opening angle bracket.
1086 // --------------------------------------------------------------------------
1089 // --------------------------------------------------------------------------
1090 // Internal tracking.
1091 // --------------------------------------------------------------------------
1092 if (flag_angle) _exception("unbalanced open angle bracket", offset);
1093 active_angle = true;
1096 // --------------------------------------------------------------------------
1097 // Add a display-name token (type "n") to the tokens vector if any text
// preceded the angle bracket.
1098 // --------------------------------------------------------------------------
1099 if (token_begin < offset) {
1100 last_display_name = _tokens.size();
1101 _tokens.push_back({ .type = 'n',
1102 .offset = token_begin,
1103 .len = offset - token_begin,
1104 .flag_utf8 = flag_utf8,
1105 .p_token = p_token_sp, });
1106 } // -x- if token_begin -x-
1108 // --------------------------------------------------------------------------
1109 // Reset and prepare internal variables for the next token.
1110 // --------------------------------------------------------------------------
1111 RESET_FOR_NEXT_TOKEN;
1116 // --------------------------------------------------------------------------
1117 // At sign ("@") delimiter.
1118 // --------------------------------------------------------------------------
1121 // --------------------------------------------------------------------------
1122 // Internal tracking (only one at sign is accepted per address).
1123 // --------------------------------------------------------------------------
1124 if (active_at_sign) _exception("too many at (\"@\") signs", offset);
1125 active_at_sign = true;
1127 // --------------------------------------------------------------------------
1128 // Add the local-part token (type "l", the text before the at sign) to the
// tokens vector and remember its index.
1129 // --------------------------------------------------------------------------
1130 last_local_part = _tokens.size();
1131 _tokens.push_back({ .type = 'l',
1132 .offset = token_begin,
1133 .len = offset - token_begin,
1134 .flag_utf8 = flag_utf8,
1135 .flag_angle = flag_angle,
1136 .p_token = p_token, });
1138 // --------------------------------------------------------------------------
1139 // Reset and prepare internal variables for the next token.
1140 // --------------------------------------------------------------------------
1141 RESET_FOR_NEXT_TOKEN;
1146 // --------------------------------------------------------------------------
1147 // Closing angle-bracket.
1148 // --------------------------------------------------------------------------
1151 // --------------------------------------------------------------------------
1152 // Internal tracking.
1153 // --------------------------------------------------------------------------
1154 if (!active_angle) _exception("unbalanced closing angle bracket", offset);
1155 active_angle = false;
1156 goto main_parsing_email;
1158 // --------------------------------------------------------------------------
1159 // Reset and prepare internal variables for the next token.
// NOTE(review): this reset appears unreachable after the goto above -- confirm.
1160 // --------------------------------------------------------------------------
1161 RESET_FOR_NEXT_TOKEN;
1166 // --------------------------------------------------------------------------
1167 // Comma delimiter, signifies the end of an eMail address.
1168 // --------------------------------------------------------------------------
1172 // --------------------------------------------------------------------------
1173 // Internal tracking.
1174 // --------------------------------------------------------------------------
1175 if (active_quote) _exception("unbalanced quotation-marks", offset);
1176 if (active_angle) _exception("unbalanced open angle bracket before comma", offset);
1179 // --------------------------------------------------------------------------
1180 // Add the closing token to the tokens vector: a domain-part (type "d") if an
// at sign was seen, otherwise a local-part (type "l").
1181 // --------------------------------------------------------------------------
1182 if (active_at_sign) { // Domain-part has been started
1183 last_domain_part = _tokens.size();
1184 _tokens.push_back({ .type = 'd',
1185 .offset = token_begin,
1186 .len = offset - token_begin,
1187 .flag_utf8 = flag_utf8,
1188 .flag_angle = _tokens[last_local_part].flag_angle,
1189 .p_token = p_token, });
1190 active_at_sign = false;
1191 } else { // Domain-part has not been started, so there's only a local-part here
1192 last_local_part = _tokens.size();
1193 _tokens.push_back({ .type = 'l',
1194 .offset = token_begin,
1195 .len = offset - token_begin,
1196 .flag_utf8 = flag_utf8,
1197 .flag_angle = flag_angle,
1198 .p_token = p_token, });
1199 } // -x- if active_at_sign -x-
1201 // --------------------------------------------------------------------------
1202 // Perform a few checks to make sure we're not creating phantom addresses
// (a zero-length address is kept only when angle brackets were detected).
1203 // --------------------------------------------------------------------------
1204 int __email_len = last_domain_part == -1 ? _tokens[last_local_part].len : (_tokens[last_domain_part].offset - _tokens[last_local_part].offset) + _tokens[last_domain_part].len;
1205//std::cout << "__email_len=" << std::to_string(__email_len) << std::endl;
1206 if (__email_len == 0 && !flag_angle) continue;
1207//std::cout << "last_local_part=" << std::to_string(last_local_part) << std::endl;
1208//std::cout << "last_domain_part=" << std::to_string(last_domain_part) << std::endl;
1210 // --------------------------------------------------------------------------
1211 // Create a token of type "e" now that this eMail address is closed.
1213 // The reason we're calculating size based on offsets instead of by adding
1214 // sizes together (and adding 1 for the "@" sign) is that comments can be
1215 // included in the localpart portion, which normally won't be counted in any
1217 // --------------------------------------------------------------------------
1218 _index_e.push_back(_tokens.size()); // Add to index of eMail addresses (before adding to _tokens vector, _tokens.size() is the position)
1219 _tokens.push_back({ .type = 'e',
1220 .offset = _tokens[last_local_part].offset,
1221 .len = __email_len,// - token_begin,
1222 .flag_utf8 = _tokens[last_local_part].flag_utf8 || flag_utf8,
1223 .flag_angle = _tokens[last_local_part].flag_angle,
1224 .flag_null_addr = __email_len == 0,
1225 .p_token = _tokens[last_local_part].p_token + ((last_domain_part == -1 || _tokens[last_domain_part].p_token.empty()) ? u8"" : u8"@" + _tokens[last_domain_part].p_token),
1226 .index_display_name = last_display_name,
1227 .index_local_part = last_local_part,
1228 .index_domain_part = last_domain_part, });
1229 last_display_name = -1;
1230 last_local_part = -1;
1231 last_domain_part = -1;
1234 // --------------------------------------------------------------------------
1235 // Reset and prepare internal variables for the next token.
1236 // --------------------------------------------------------------------------
1237 RESET_FOR_NEXT_TOKEN;
1242 // --------------------------------------------------------------------------
1243 // Opening comment parenthesis.
1244 // --------------------------------------------------------------------------
1250 // --------------------------------------------------------------------------
1251 // Closing comment parenthesis.
1252 // --------------------------------------------------------------------------
1254 if (--comment_depth < 0) _exception("unbalanced closing comment parenthesis", offset);
1258 // --------------------------------------------------------------------------
1259 // Backslash (quote-literal).
1260 // --------------------------------------------------------------------------
1263 // --------------------------------------------------------------------------
1264 // Prevent a potential out-of-bounds buffer-overrun problem (a backslash as
// the final character has nothing to escape).
1265 // --------------------------------------------------------------------------
1266 if (++offset == len) {
1267 _exception("unbalanced quote-literal (backslash)", offset);
1268 continue; // Do this in case we're not throwing exceptions
1269 } // -x- if offset -x-
1271 // --------------------------------------------------------------------------
1272 // Update to next character (whatever it is, we're taking it literally).
1273 // --------------------------------------------------------------------------
1274 ch = mailbox[offset];
1275 goto main_parsing_loop_default; // Fall-through to default
1279 // --------------------------------------------------------------------------
1280 // All remaining characters.
1281 // --------------------------------------------------------------------------
1283 //if (flag_angle) _exception("additional data not permitted", offset);
1284 main_parsing_loop_default:
1285 if (ch > 127) { // Include all UTF-8 characters (unless prevented by the exception)
1287 if (!_policy_support_utf8) _exception("UTF-8 byte encountered", offset);
1288 p_token.push_back(ch);
1289 p_token_sp.push_back(ch);
1290 } else if (CTEXT(ch) || ' ') { // NOTE(review): "|| ' '" is a non-zero constant, so this condition is always true and CTEXT(ch) has no effect; "ch == ' '" looks intended -- confirm
1291 if (ch != ' ') p_token.push_back(ch); // Exclude spaces
1292 if (!(ch == ' ' && p_token.size() == 0)) p_token_sp.push_back(ch); // Keep spaces (except leading ones, before any non-space data)
1295 } // -x- switch ch -x-
1297 } while (++offset < len); // -x- do while -x-
1299 // --------------------------------------------------------------------------
1300 // If the final token isn't empty (a.k.a., unfinished / not sealed), then
1301 // figure out what to do and run one more time, or else throw an exception.
1302 // --------------------------------------------------------------------------
1303 if (offset == len && token_begin < offset) {
1304 ch = ','; // Force comma (",") on parsing loop
1305 goto main_parsing_switch; // Re-enters the loop body so the pending token is closed as if a comma followed it
1306 } else if (offset > len && token_begin < offset) {
1307 _exception("incomplete data", offset - 1);
1308 } // -x- if offset -x-
1311 }; // -x- rmailaddr* set -x-
1313 /*======================================================================*//**
1315 Find out how many eMail addresses this object holds.
1320 @returns The number of eMail addresses
1321 *///=========================================================================
// Read-only accessor: const-qualified so it can be called on const objects; the unsigned count of indexed eMail addresses is narrowed to int explicitly.
1322 int size() const { return static_cast<int>(_index_e.size()); } // -x- int size -x-
1324 /*======================================================================*//**
1326 Generate a detailed output of all tokens that's useful for debugging.
1330 g = group name (beginning; includes colon)
1331 ; = group termination (semi-colon character)
1333 e = eMail address (includes angle brackets, if present)
1336 c = comment (not implemented)
1337 \0 = not initialized (null; regard as "unknown"; this should never happen)
1340 The difference between "token" and "p_token" is that "token" is the original
1341 and [mostly] unprocessed atom, while "p_token" has been processed with any
1342 sets of angle-brackets, sets of quotation-marks, comments, and whitespace
1343 removed. In nearly all instances, the value of "p_token" is what's needed.
1344 @returns std::string containing multi-line text (one token per line)
1345 *///=========================================================================
1346 std::string tokens_to_string(
1347 /// Filter (string containing characters for those types that are to be
1348 /// included {unrecognized types will be ignored}; the default is no filter)
1349 const std::string filter = "",
1350 /// Prefix (text to insert before the beginning of each line)
1351 const std::string prefix = "",
1352 /// End-of-Line sequence (default is "\n")
1353 const std::string eol = "\n") {
1355 // --------------------------------------------------------------------------
1356 // Internal variables.
1357 // --------------------------------------------------------------------------
1360 // --------------------------------------------------------------------------
1361 // Loop that builds list of tokens (one per line).
1362 // --------------------------------------------------------------------------
1363 for (int i = 0; i < _tokens.size(); i++) { // NOTE(review): signed index compared against size(), presumably unsigned -- confirm
1365 // --------------------------------------------------------------------------
1367 // --------------------------------------------------------------------------
1368 if (filter.empty() || filter.find(_tokens[i].type) != std::string::npos) { // An empty filter accepts every type; otherwise the token's type character must occur in filter
1370 // --------------------------------------------------------------------------
1371 // Shared characteristics (reported for every token type).
1372 // --------------------------------------------------------------------------
1373 t.append(prefix + "index=" + std::to_string(i)
1374 + " type=" + _tokens[i].type
1375 + " utf8=" + (_tokens[i].flag_utf8 ? "y" : "n")
1376 + " punycode=" + (_tokens[i].flag_punycode ? "y" : "n")
1377 + " obsolete=" + (_tokens[i].flag_obsolete ? "y" : "n")
1378 + " offset=" + std::to_string(_tokens[i].offset)
1379 + " length=" + std::to_string(_tokens[i].len)
1380 + " token=" + std::string((char*)_addr.c_str()).substr(_tokens[i].offset, _tokens[i].len) // Original text, cut from the saved address by offset and length
1381 + " p_token=" + (char*)_tokens[i].p_token.c_str()); // Processed text stored with the token
1383 // --------------------------------------------------------------------------
1384 // Type-specific characteristics.
1385 // --------------------------------------------------------------------------
1386 switch (_tokens[i].type) {
1388 t.append(std::string( " fqdn=") + (_tokens[i].flag_fqdn ? "y" : "n"));
1391 t.append(std::string( " angle=") + (_tokens[i].flag_angle ? "y" : "n"));
1392 // Fall-through to type "l"
1394 t.append(std::string(" null_addr=") + (_tokens[i].flag_null_addr ? "y" : "n"));
1396 } // -x- switch type -x-
1398 // --------------------------------------------------------------------------
1399 // Final EoL (End of Line) sequence.
1400 // --------------------------------------------------------------------------
1403 } // -x- if filter -x-
1408 }; // -x- std::string tokens_to_string -x-
1410 /*======================================================================*//**
1412 Array-style access to eMail addresses. The first element is at index 0.
1416 @returns std::u8string with only the eMail address (no angle brackets, etc.)
1417 as a native UTF-8 string
1418 *///=========================================================================
1419 std::u8string operator[](
1420 /// Index of eMail address to query for (0 = first element; negative index
1421 /// values are calculated in reverse, starting with -1 as the final position)
1423 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token; // Maps the index through _index_e to the "e" token; NOTE(review): no range check is visible, so an out-of-range index reaches the subscript operators unchecked -- confirm callers validate
1424 }; // -x- std::u8string operator[] -x-
1426 /*======================================================================*//**
1428 Support convenient streaming usage with std::cout, std::cerr, and friends.
1429 @returns eMail address in human-readable form
1430 *///=========================================================================
1431 friend std::ostream& operator<< (
1432 /// Output stream (provided automatically by std::cout and std::cerr)
1434 /// rmailaddr object whose saved address (_addr) is written to the stream
1435 rmailaddr const& c) { return o << (char*)c._addr.c_str(); }; // -x- std::ostream& operator<< -x-
1437 }; // -x- class rmailaddr -x-
1439}; // -x- namespace randolf -x-