6#include <stdexcept> // std::invalid_argument
9#include <arpa/inet.h> // Used only for checking for valid IP addresses in domain literals (inet_pton)
13 // --------------------------------------------------------------------------
14 // Constants that list sets of valid characters, which are optimized to test
15 // for ranges of the most commonly-used characters first during parsing, are
16 // named consistently with their respective rule names as defined in RFC2822.
18 // CRLF and \ are invisible in the quoted string according to RFC2822 section
21 // RFC2822 section 3.2.5 also defines a "quoted-string" as containing the
22 // following valid characters (spaces are also permitted): 33, 35-91, 93-126
23 // Certain characters must be quoted first though, and every character
24 // following a backslash is taken literally (and the backslash is removed
27 // RFC2822 section 3.2.4 defines an "atom" as containing the following valid
28 // characters: 0123456789
29 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
30 // abcdefghijklmnopqrstuvwxyz
31 // !#$%&'*+-/=?^_`{|}~
33 // Quote characters and quotation marks are not permitted in the domain part.
35 // According to RFC2822 section 3.2.5, a phrase (DisplayName / Comments) can
36 // be either an atom (ATEXT) or quoted-text (QTEXT).
38 // According to RFC2822 section 2.2.2, whitespace characters are tabs (ASCII
39 // character 9) and spaces (ASCII character 32).
41 // RFC2822 section 3.4 last paragraph indicates that a group construct is
42 // optional, and preceded by a colon following any number of comma-delimited
43 // recipients (including zero or one). Group constructs must end with a
45 // --------------------------------------------------------------------------
47 // --------------------------------------------------------------------------
48 // The following macros are optimized for performance by testing for the most
49 // commonly-used characters first.
52 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
53 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
54 // 47...57 /0123456789
59 // --------------------------------------------------------------------------
// ATEXT(a): non-zero when character code `a` is an RFC2822 "atext" character
// (letters, digits, and !#$%&'*+-/=?^_`{|}~). The most common ranges are
// tested first. Every use of the argument is parenthesized so that expression
// arguments (e.g., `c & 0x7f`) bind as intended. The argument is still
// evaluated more than once, so don't pass expressions with side effects.
#define ATEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 65 && (a) <= 90) \
               || ((a) >= 47 && (a) <= 57) \
               || (a) == 45 || (a) == 33 \
               || ((a) >= 35 && (a) <= 39) \
               || (a) == 42 || (a) == 43 \
               || (a) == 61 || (a) == 63)
68 // --------------------------------------------------------------------------
69 // ATEXT_OBS ("obsolete standard" is ATEXT plus periods, spaces, and tabs)
70 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
71 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
72 // 45...57 -./0123456789
73 // 32 | 33 {space:32}!
78 // --------------------------------------------------------------------------
79 #define ATEXT_OBS(a) ((a >= 94 && a <= 126) \
80 || (a >= 65 && a <= 90) \
81 || (a >= 45 && a <= 57) \
82 || a == 32 || a == 33 \
83 || (a >= 35 && a <= 39) \
84 || a == 42 || a == 43 \
85 || a == 61 || a == 63 \
88 // --------------------------------------------------------------------------
89 // CTEXT (comment text)
90 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
91 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
93 // --------------------------------------------------------------------------
// CTEXT(a): non-zero when character code `a` is comment text, i.e., in one of
// the ranges 33...39, 42...91, or 93...126 (which leaves out the parentheses
// and the backslash). Every use of the argument is parenthesized so that
// expression arguments bind as intended. The argument is still evaluated more
// than once, so don't pass expressions with side effects.
#define CTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 42 && (a) <= 91) \
               || ((a) >= 33 && (a) <= 39))
98 // --------------------------------------------------------------------------
99 // CTEXT_WSP (comment text with white space)
100 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
101 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
102 // 32...39 {space:32}!"#$%&'
104 // --------------------------------------------------------------------------
105 #define CTEXT_WSP(a) ((a >= 93 && a <= 126) \
106 || (a >= 42 && a <= 91) \
107 || (a >= 32 && a <= 39) \
110 // --------------------------------------------------------------------------
111 // CTEXT_OBS (obsolete comment text)
115 // --------------------------------------------------------------------------
116 #define CTEXT_OBS(a) (CTEXT(a) \
120 // --------------------------------------------------------------------------
121 // DTEXT (domain-part, not including characters needed for domain-literals)
122 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
123 // 33...90 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ
124 // --------------------------------------------------------------------------
// DTEXT(a): non-zero when character code `a` is domain text, i.e., in one of
// the ranges 33...90 or 94...126 (which leaves out the square brackets and the
// backslash needed for domain-literals). Every use of the argument is
// parenthesized so that expression arguments bind as intended. The argument
// is still evaluated more than once, so don't pass expressions with side
// effects.
#define DTEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 33 && (a) <= 90))
128 // --------------------------------------------------------------------------
129 // FWS (folding white space)
133 // --------------------------------------------------------------------------
134 #define FWS(a) (a == 10 \
138 // --------------------------------------------------------------------------
139 // QTEXT (quoted text)
140 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
141 // 35...91 #$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
144 // --------------------------------------------------------------------------
// QTEXT(a): non-zero when character code `a` is quoted text, i.e., a space
// (32), 33, or in one of the ranges 35...91 or 93...126 (which leaves out the
// quotation mark and the backslash). Every use of the argument is
// parenthesized so that expression arguments bind as intended. The argument
// is still evaluated more than once, so don't pass expressions with side
// effects.
#define QTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 35 && (a) <= 91) \
               || (a) == 32 || (a) == 33)
149 // --------------------------------------------------------------------------
151 // 14...127 {char:14-31}{space:32}!"#$%&'()*+,-./0123456789:;<=>
152 // ?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^
153 // _`abcdefghijklmnopqrstuvwxyz{|}~
155 // 1...9 {char:1-6}{beep:7}{backspace:8}{tab:9}
156 // 11 | 12 {char:11}{char:12}
157 // --------------------------------------------------------------------------
// TEXT(a): non-zero when character code `a` is in one of the ranges 1...9 or
// 14...127, or is 11 or 12 (i.e., any 7-bit character code other than null
// (0), line-feed (10), and carriage-return (13)). Every use of the argument
// is parenthesized so that expression arguments bind as intended. The
// argument is still evaluated more than once, so don't pass expressions with
// side effects.
#define TEXT(a) (((a) >= 14 && (a) <= 127) \
              || ((a) >= 1 && (a) <= 9) \
              || (a) == 11 || (a) == 12)
162 // --------------------------------------------------------------------------
166 // --------------------------------------------------------------------------
167 #define WSP(a) (a == 32 \
170 // --------------------------------------------------------------------------
171 // Used by the set() method to consistently reset internal variables when
172 // moving onward to the next token.
174 // token_begin: Configures beginning of next token.
175 // --------------------------------------------------------------------------
176 #define RESET_FOR_NEXT_TOKEN \
178 flag_quote = false; \
179 token_begin = offset + 1; \
183 /*======================================================================*//**
185 This @ref rmailaddr class provides an object-oriented eMail address.
189 Some of the key features are:
191 - constructors with sensible defaults help to simplify coding
192 - documentation includes code samples (with @c \#include lines as needed)
193 - can handle ASCIIZ without needing to specify string length
194 - can handle @c std::string (which tracks its own string length)
198 Validation of the format of an eMail address is helpful in ensuring that
199 eMail addresses received from elsewhere comply with internet standards.
203 I created this class to make it easier to write internet server daemons and
204 other software that needs to accept and/or handle eMail addresses. (This is
205 a complete re-write of the version I wrote in Java 17 years ago in 2007,
206 which includes a significant array of differences due to the improved parsing
207 approaches I use now that are more efficient, and the need to make sure that
208 UTF-8 characters and punycode are both handled in a transparent manner.)
212 @author Randolf Richardson
215 2024-May-07 v1.00 Initial version
218 Lower-case letter "m" is regularly used in partial example code to represent
219 an instantiated rmailaddr object.
221 An ASCIIZ string is a C-string (char* array) that includes a terminating null
222 (0) character at the end.
226 I use the term "ASCIIZ string" to indicate an array of characters that's
227 terminated by a 0 (a.k.a., null). Although this is very much the same as a
228 C-string, the difference is that in many API functions a C-string must often
229 be accompanied by its length value. When referring to an ASCIIZ string, I'm
230 intentionally indicating that the length of the string is not needed because
231 the string is null-terminated. (This term was also commonly used in assembly
232 language programming in the 1970s, 1980s, and 1990s, and as far as I know is
233 still used by machine language programmers today.)
238 #include <iostream> // std::cout, std::cerr, std::endl, etc.
239 #include <stdexcept> // std::invalid_argument exception
241 #include <randolf/rmailaddr>
243 int main(int argc, char *argv[]) {
245 randolf::rmailaddr m("nobody@example.com");
246 } catch (const std::invalid_argument& e) {
247 std::cerr << "eMail address format exception: " << e.what() << std::endl;
249 } catch (const std::exception& e) {
250 std::cerr << "Other exception: " << e.what() << std::endl;
254 } // -x- int main -x-
257 Parameter stacking is supported (with methods that return @c rmailaddr*); in
258 this example, notice that semicolons (";") and "e." references are omitted
259 (when compared with the above):
262 #include <iostream> // std::cout, std::cerr, std::endl, etc.
263 #include <stdexcept> // std::invalid_argument exception
265 #include <randolf/rmailaddr>
267 int main(int argc, char *argv[]) {
269 randolf::rmailaddr m("nobody@example.com");
270 } catch (const std::invalid_argument& e) {
271 std::cerr << "eMail address format exception: " << e.what() << std::endl;
273 } catch (const std::exception& e) {
274 std::cerr << "Other exception: " << e.what() << std::endl;
278 } // -x- int main -x-
280 *///=========================================================================
284 /*======================================================================*//**
286 Structure of errors (only used when exceptions are disabled).
288 @see policy_throw_exceptions
289 *///=========================================================================
293 /// Offset (0 = position of first byte)
295 }; // -x- struct error_data -x-
298 /*======================================================================*//**
300 Structure of positions within the original eMail string where a portion
301 begins, and its length (in bytes), along with various other information about
304 This is used internally, and std::vector<mail_addr_token> organizes them and
305 looks after freeing memory.
306 *///=========================================================================
307 struct mail_addr_token {
309 /// g = group name (beginning; includes colon)
310 /// ; = group termination (semi-colon character)
312 /// e = eMail address (includes angle brackets, if present)
316 /// \0 = not initialized (null can effectively be regarded as meaning "unknown")
318 /// Offset, within the string, where this part begins
319 unsigned int offset = 0;
320 /// Total number of bytes
321 unsigned int len = 0;
322 /// Whether any UTF-8 characters are present in this part
323 bool flag_utf8 = false;
324 /// Whether this part is in punycode (begins with "xn--")
325 bool flag_punycode = false; // TODO
326 /// Whether this part is "obsolete" (according to RFCs)
327 bool flag_obsolete = false; // TODO
328 /// Whether eMail address was enclosed in angle brackets (type "e" only)
329 bool flag_angle = false;
330 /// Whether the token was enclosed in quotation marks
331 bool flag_quotes = false;
332 /// Whether eMail address is a null address enclosed in angle brackets (type "e" only)
333 bool flag_null_addr = false;
334 /// Whether the domain-part is an FQDN (type "d" only)
335 bool flag_fqdn = false; // TODO
336 /// Whether the domain-part is a domain-literal (type "d" only)
337 bool flag_domain_literal = false; // TODO
338 /// Depth of groups (types "g" and ";" only)
339 unsigned short depth = 0;
340 /// Processed data, with quotation marks, angle brackets, comments, whitespace, etc., removed
341 std::u8string p_token;
342 /// Index to display-name (type "e" only)
343 int index_display_name = -1;
344 /// Index to local-part (type "e" only)
345 int index_local_part = -1;
346 /// Index to domain-part (type "e" only)
347 int index_domain_part = -1;
348 }; // -x- struct mail_addr_token -x-
350 // --------------------------------------------------------------------------
351 // Internal variables.
352 // --------------------------------------------------------------------------
353 std::u8string _addr; // Original eMail address
354 std::vector<mail_addr_token> _tokens; // All eMail address tokens
355 std::vector<int> _index_e; // Index of type "e" records in _tokens
356 std::vector<error_data> _errors; // Error tracking
357 short group_depth = 0; // Recursive group tracking
358 bool angle_bracket_mode = false; // Angle-bracket mode tracking
359 bool quote_mode = false; // Quotation-marks mode tracking
361 // --------------------------------------------------------------------------
363 // --------------------------------------------------------------------------
364 bool _policy_keep_comments = false; // Whether to retain comments embedded in eMail addresses
365 bool _policy_throw_exceptions = true; // TRUE = throw exceptions; FALSE = record errors internally (in _errors) instead
366 bool _policy_tabs_to_spaces = false; // Whether to convert every tab into a space
367 bool _policy_support_utf8 = true; // Whether to support UTF-8 (FALSE = 7bit characters only)
369 /*======================================================================*//**
371 *///=========================================================================
375 /// Offset (0 = position of first byte)
377 if (_policy_throw_exceptions) throw std::invalid_argument(message + " at offset " + std::to_string(offset));
378 _errors.push_back({ message, offset });
380 } // -x- void _exception -x-
383 /*======================================================================*//**
385 Instantiate an empty rmailaddr that doesn't qualify as a properly-formatted
386 internet eMail address (because the minimum length of a valid internet eMail
387 address is 1 character).
389 Instantiating an empty rmailaddr is particularly useful for header-file
390 definitions; for example:
392 #include <iostream> // std::cout, std::cerr, std::endl, etc.
393 #include <stdexcept> // std::invalid_argument exception
395 #include <randolf/rmailaddr>
397 randolf::rmailaddr m; // <-- Empty rmailaddr initialization (no exceptions)
399 int main(int argc, char *argv[]) {
401 m.set("nobody@example.com");
402 } catch (const std::invalid_argument& e) {
403 std::cerr << "eMail address format exception: " << e.what() << std::endl;
405 } catch (const std::exception& e) {
406 std::cerr << "Other exception: " << e.what() << std::endl;
410 } // -x- int main -x-
412 *///=========================================================================
413 rmailaddr() noexcept {}; // -x- constructor rmailaddr -x-
415 /*======================================================================*//**
417 Instantiate an rmailaddr that qualifies as a properly-formatted internet
418 eMail address (if it doesn't qualify, then an exception will be thrown).
422 #include <iostream> // std::cout, std::cerr, std::endl, etc.
423 #include <stdexcept> // std::invalid_argument exception
425 #include <randolf/rmailaddr>
427 int main(int argc, char *argv[]) {
429 randolf::rmailaddr m("nobody@example.com");
430 } catch (const std::invalid_argument& e) {
431 std::cerr << "eMail address format exception: " << e.what() << std::endl;
433 } catch (const std::exception& e) {
434 std::cerr << "Other exception: " << e.what() << std::endl;
438 } // -x- int main -x-
440 @throws std::invalid_argument describing the problem, along with the byte
441 offset where the problem originated from
443 *///=========================================================================
445 /// RFC-compliant eMail address
446 const char8_t* mailbox,
447 /// Number of characters (-1 = ASCIIZ string)
448 int len = -1) { set(mailbox, len); }; // -x- constructor rmailaddr -x-
450 /*======================================================================*//**
451 @copydoc rmailaddr(const char8_t*, int)
453 *///=========================================================================
455 /// RFC-compliant eMail address
457 /// Number of characters (-1 = ASCIIZ string)
458 int len = -1) { set((char8_t*)mailbox, len); }; // -x- constructor rmailaddr -x-
460 /*======================================================================*//**
461 @copydoc rmailaddr(const char8_t*, int)
463 *///=========================================================================
465 /// RFC-compliant eMail address
466 const std::string mailbox) { set((char8_t*)mailbox.data(), mailbox.size()); }; // -x- constructor rmailaddr -x-
468 /*======================================================================*//**
469 @copydoc rmailaddr(const char8_t*, int)
471 *///=========================================================================
473 /// RFC-compliant eMail address
474 const std::u8string mailbox) { set(mailbox.data(), mailbox.size()); }; // -x- constructor rmailaddr -x-
476 /*======================================================================*//**
478 Access only the eMail address, without display-name, and without any sets of
479 enclosing quotation-marks or enclosing angle-brackets, etc.
485 @throws std::out_of_range if the index is out-of-range
486 @returns std::string with only the eMail address (no display-name, and no
487 enclosing sets of quotation-marks or enclosing angle-brackets, etc.)
488 *///=========================================================================
490 /// Index of eMail address to query for (0 = first element; negative index
491 /// values are calculated in reverse, starting with -1 as the final position)
493// return std::string((char*)_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token.c_str());
494 return std::string((char*)_tokens.at(_index_e.at(index >= 0 ? index : _index_e.size() + index)).p_token.c_str());
495 }; // -x- std::string addr -x-
497 /*======================================================================*//**
499 Access an eMail address's display-name (the portion preceding the angle
500 brackets). If there were no angle-brackets, then an empty string will
507 @returns std::string with only the display-name (no quotation marks, etc.)
508 *///=========================================================================
509 std::string display_name(
510 /// Index of eMail address to query for (0 = first element; negative index
511 /// values are calculated in reverse, starting with -1 as the final position)
513 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name].p_token.c_str());
514 }; // -x- std::string display_name -x-
516 /*======================================================================*//**
518 Access an eMail address's domain-part (the portion following the @c @ sign).
525 @returns std::string with only the domain-part (no angle brackets, etc.)
526 *///=========================================================================
527 std::string domain_part(
528 /// Index of eMail address to query for (0 = first element; negative index
529 /// values are calculated in reverse, starting with -1 as the final position)
531 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_domain_part].p_token.c_str());
532 }; // -x- std::string domain_part -x-
534 /*======================================================================*//**
536 Access an eMail address (enclosed in angle-brackets), and preceded by the
537 display-name (if one is available).
540 If the original form of the display-name had a delimiting space before the
541 eMail address, then that space will be present in the result here. If not, a
542 space will not be inserted. (In other words, this aspect of the original
543 full eMail address will be retained.)
549 @returns std::string with display-name and eMail address (in angle-brackets)
550 *///=========================================================================
552 /// Index of eMail address to query for (0 = first element; negative index
553 /// values are calculated in reverse, starting with -1 as the final position)
556 // --------------------------------------------------------------------------
557 // The eMail address has no display-name because it wasn't enclosed in angle
558 // brackets, so present the eMail address on its own, in angle brackets.
559 // --------------------------------------------------------------------------
560 mail_addr_token e = _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]];
561 if (e.index_display_name < 0)
562 return "<" + std::string((char*)e.p_token.c_str()) + ">";
564 // --------------------------------------------------------------------------
565 // There was a display-name, so return the eMail address with display-name
566 // (enclosed in quotation marks if it started out that way).
567 // --------------------------------------------------------------------------
568 mail_addr_token n = _tokens[e.index_display_name];
571 + std::string((char*)n.p_token.c_str())
574 + std::string((char*)e.p_token.c_str())
577 return std::string((char*)n.p_token.c_str())
579 + std::string((char*)e.p_token.c_str())
581 }; // -x- std::string email -x-
583 /*======================================================================*//**
585 Find out if this object doesn't hold any eMail addresses.
590 @returns TRUE = no eMail addresses@n
591 FALSE = one or more eMail addresses
592 *///=========================================================================
593 bool empty() { return _index_e.empty(); }; // -x- bool empty -x-
595 /*======================================================================*//**
597 Return a list of errors that have been collected (instead of throwing
600 @see policy_throw_exceptions
601 @returns Vector containing @c error_data
602 *///=========================================================================
603 std::vector<error_data> errors() { return _errors; }; // -x- std::vector<error-data> errors -x-
605 /*======================================================================*//**
607 Clear the list of errors that have been collected (instead of throwing
610 @see policy_throw_exceptions
611 @returns The same rmailaddr object so as to facilitate stacking
612 *///=========================================================================
613 rmailaddr* errors_clear() { _errors.clear(); return this; }; // -x- rmailaddr* errors_clear -x-
615 /*======================================================================*//**
617 Grade an eMail address, similar to traditional elementary school grades. For
618 simplicity, grades "a" through "c" are passes, while grades "d" through "f"
619 are failures, although if less strict then "d" should also qualify as a pass.
623 a = Angle-brackets surrounding eMail address (optional display-name)
624 b = Bare eMail address (no display-name)
625 c = Complex eMail address (groups; optional angle-brackets; optional display-name)
626 d = Defective (because obsolete RFC standards were utilized)
627 e = Errors (only when collecting errors instead of throwing exceptions)
628 f = Failure (an exception was thrown, or eMail address is blank)
631 To test for a pass, use a comparison such as <tt>m.grade() <= 'c'</tt>
632 (strict) or <tt>m.grade() <= 'd'</tt> (not strict).
634 *///=========================================================================
636 /// eMail address index (default is 0 for the first eMail address)
640 }; // -x- char grade -x-
642 /*======================================================================*//**
644 Indicates whether a display-name was included with this eMail address.
645 @returns TRUE = eMail address includes a display-name@n
646 FALSE = eMail address has no display-name
647 *///=========================================================================
648 bool has_display_name(
649 /// eMail address index (default is 0 for the first eMail address)
652 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
653 }; // -x- bool has_display-name -x-
655 /*======================================================================*//**
657 Find out whether this object holds any number of eMail addresses. If there
658 are no eMail addresses, then this method returns @c FALSE.
663 @returns TRUE = one or more eMail addresses@n
664 FALSE = no eMail addresses
665 *///=========================================================================
666 bool has_any() { return _index_e.size() > 0; }; // -x- bool has_any -x-
668 /*======================================================================*//**
670 Find out whether this object holds multiple eMail addresses. If there is
671 only one eMail address, or no eMail addresses at all, then this method
677 @returns TRUE = two or more eMail addresses@n
678 FALSE = one eMail address@n
679 FALSE = no eMail addresses
680 *///=========================================================================
681 bool has_multiple() { return _index_e.size() > 1; }; // -x- bool has_multiple -x-
683 /*======================================================================*//**
685 Find out whether this object holds exactly one eMail address. If there are
686 two or more eMail addresses, or no eMail addresses, then this method returns
692 @returns TRUE = exactly one eMail address@n
693 FALSE = two or more eMail addresses@n
694 FALSE = no eMail addresses
695 *///=========================================================================
696 bool has_one() { return _index_e.size() == 1; }; // -x- bool has_one -x-
698 /*======================================================================*//**
700 Find out the state of this policy.
701 @see policy_keep_comments
702 @returns policy status
703 *///=========================================================================
704 bool is_policy_keep_comments() { return _policy_keep_comments; }; // -x- bool is_policy_keep_comments -x-
706 /*======================================================================*//**
708 Find out the state of this policy.
709 @see policy_tabs_to_spaces
710 @returns policy status
711 *///=========================================================================
712 bool is_policy_tabs_to_spaces() { return _policy_tabs_to_spaces; }; // -x- bool is_policy_tabs_to_spaces -x-
714 /*======================================================================*//**
716 Find out the state of this policy.
717 @see policy_throw_exceptions
718 @returns policy status
719 *///=========================================================================
720 bool is_policy_throw_exceptions() { return _policy_throw_exceptions; }; // -x- bool is_policy_throw_exceptions -x-
722 /*======================================================================*//**
724 Find out the state of this policy.
725 @see policy_support_utf8
726 @returns policy status
727 *///=========================================================================
728 bool is_policy_support_utf8() { return _policy_support_utf8; }; // -x- bool is_policy_support_utf8 -x-
730 /*======================================================================*//**
732 Indicates whether this is just an eMail address, without any other parts such
733 as display-name, group constructs, comments, etc.
734 @returns TRUE = eMail address includes a display-name@n
735 FALSE = eMail address has no display-name
736 *///=========================================================================
738 /// indicate whether angle-brackets are okay (default is FALSE so that the
739 /// meaning of the word "pure" is not tainted)
740 const bool angle_flag = false,
741 /// eMail address index (default is 0 for the first eMail address)
744//TODO: Finish this (we need to consider groups, display-name, comments, etc.)
745 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
746 }; // -x- bool is_pure -x-
748 /*======================================================================*//**
750 Access an eMail address's local-part (the portion preceding the @c @ sign).
756 @returns std::string with only the local-part (no angle brackets, etc.)
757 *///=========================================================================
758 std::string local_part(
759 /// Index of eMail address to query for (0 = first element; negative index
760 /// values are calculated in reverse, starting with -1 as the final position)
762 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_local_part].p_token.c_str());
763 }; // -x- std::string local_part -x-
765 /*======================================================================*//**
767 Sets the policy for whether to keep comments that were embedded in eMail
768 address group-construct, display-name, and local-part portions.
770 Comments are excluded by default because most systems don't need them, but in
771 the event that they are needed (or wanted), this policy makes it possible to
772 make sure they aren't excluded during processing.
774 @see is_policy_keep_comments
775 @returns The same rmailaddr object so as to facilitate stacking
776 *///=========================================================================
777 rmailaddr* policy_keep_comments(
778 /// FALSE = do not retain comments embedded in eMail addresses (default)@n
779 /// TRUE = retain comments embedded in eMail addresses
782 // --------------------------------------------------------------------------
783 // Update internal policy.
784 // --------------------------------------------------------------------------
785 _policy_keep_comments = policy_flag;
787 // --------------------------------------------------------------------------
788 // Return this object to facilitate stacking.
789 // --------------------------------------------------------------------------
792 }; // -x- rmailaddr* policy_keep_comments -x-
794 /*======================================================================*//**
796 Sets the policy for whether to support UTF-8 characters.
798 Some older systems may not be able to handle 8-bit data that UTF-8 utilizes,
799 in which case this policy makes it possible to easily reject incompatible
800 eMail addresses before attempting to use them with such systems.
802 @see is_policy_support_utf8
803 @returns The same rmailaddr object so as to facilitate stacking
804 *///=========================================================================
805 rmailaddr* policy_support_utf8(
806 /// TRUE = support UTF-8 characters in eMail addresses (default)@n
807 /// FALSE = do not support UTF-8 characters in eMail addresses
810 // --------------------------------------------------------------------------
811 // Update internal policy.
812 // --------------------------------------------------------------------------
813 _policy_support_utf8 = policy_flag;
815 // --------------------------------------------------------------------------
816 // Return this object to facilitate stacking.
817 // --------------------------------------------------------------------------
820 }; // -x- rmailaddr* policy_support_utf8 -x-
822 /*======================================================================*//**
824 Sets the policy for whether to convert every tab character (ASCII character 9)
825 to a space (ASCII character 32). This conversion occurs only once when the
826 eMail address is initially specified in a constructor or by way of one of the
827 @ref set() methods (changing this policy after this point will not be applied
828 to the current eMail address, but it will be in effect for future calls to
829 any of the @ref set() methods).
831 There are some situations where a tab character can create problems, such as
832 when interacting with certain older software or software that makes incorrect
833 assumptions about how to parse an eMail address, and this policy makes it
834 easy to accommodate such situations for the tab character, which some users
835 may be including by using the tab key on their keyboards.
837 @see is_policy_tabs_to_spaces
838 @returns The same rmailaddr object so as to facilitate stacking
839 *///=========================================================================
840 rmailaddr* policy_tabs_to_spaces(
841 /// TRUE = convert every tab character to a space@n
842 /// FALSE = do not convert tab characters to spaces (default)
845 // --------------------------------------------------------------------------
846 // Update internal policy (the flag is only stored here, so an eMail address
847 // that is already held is left untouched).
848 _policy_tabs_to_spaces = policy_flag;
850 // --------------------------------------------------------------------------
851 // Return this object to facilitate stacking.
852 // --------------------------------------------------------------------------
855 }; // -x- rmailaddr* policy_tabs_to_spaces -x-
857 /*======================================================================*//**
859 Sets the policy for whether to throw exceptions when an error is encountered.
861 When this flag is not set, errors are tracked internally instead of throwing any
862 exceptions, and will need to be retrieved using the @ref errors() method,
863 which is useful for analyzing an eMail address. (Enabling or disabling this
864 flag does not erase the errors that are stored internally; you will need to
865 use the @ref errors_clear method for this.)
868 This policy is not meant for general use in the majority of applications; it
869 is intended for technical analysis, which would be useful in diagnostic and
870 research applications, or packet analysis applications like WireShark, or for
871 advanced users who are interested in more technical detail.
874 @see is_policy_throw_exceptions
875 @returns The same rmailaddr object so as to facilitate stacking
876 *///=========================================================================
877 rmailaddr* policy_throw_exceptions(
878 /// TRUE = throw exceptions (default)@n
879 /// FALSE = don't throw exceptions (errors are tracked internally instead)
882 // --------------------------------------------------------------------------
883 // Update internal policy (the flag is only stored here).
884 // --------------------------------------------------------------------------
885 _policy_throw_exceptions = policy_flag;
887 // --------------------------------------------------------------------------
888 // Return this object to facilitate stacking.
889 // --------------------------------------------------------------------------
892 }; // -x- rmailaddr* policy_throw_exceptions -x-
894 /*======================================================================*//**
896 Set a new eMail address, resetting all internal flags, counters, and arrays
897 (but not changing any existing policies). Any existing eMail addresses will
898 be cleared out. (This method is also used internally by most of this class's
900 @throws std::invalid_argument describing the problem, along with the byte
901 offset where the problem originated from
903 @returns The same rmailaddr object so as to facilitate stacking
904 *///=========================================================================
906 /// RFC-compliant eMail address (reinterpreted as char8_t data; never modified)
908 /// Number of characters (-1 = ASCIIZ string)
910 return set(reinterpret_cast<const char8_t*>(mailbox), len); // Const-correct named cast; the char8_t* overload does the parsing
911 }; // -x- rmailaddr* set -x-
913 /*======================================================================*//**
914 @copydoc set(const char*, int)
916 @returns The same rmailaddr object so as to facilitate stacking
917 *///=========================================================================
919 /// RFC-compliant eMail address (taken by reference to avoid copying the string)
920 const std::string& mailbox) {
921 return set(reinterpret_cast<const char8_t*>(mailbox.data()), mailbox.size()); // Explicit size is passed, so the ASCIIZ (-1) measurement is not used
922 }; // -x- rmailaddr* set -x-
924 /*======================================================================*//**
925 @copydoc set(const char*, int)
927 @returns The same rmailaddr object so as to facilitate stacking
928 *///=========================================================================
930 /// RFC-compliant eMail address (received by value)
931 const std::u8string mailbox) {
932 return set(mailbox.data(), mailbox.size()); // Hand the raw data and its explicit size to the pointer-and-length overload
933 }; // -x- rmailaddr* set -x-
935 /*======================================================================*//**
936 @copydoc set(const char*, int)
938 @returns The same rmailaddr object so as to facilitate stacking
939 *///=========================================================================
941 /// RFC-compliant eMail address
942 const char8_t* mailbox,
943 /// Number of characters (-1 = ASCIIZ string)
946 // --------------------------------------------------------------------------
947 // Measure length of the eMail address if an ASCIIZ string was indicated.
948 // --------------------------------------------------------------------------
949 if (len == -1) len = std::strlen(reinterpret_cast<const char*>(mailbox));
951 // --------------------------------------------------------------------------
952 // Save a copy of the original eMail address.
953 // --------------------------------------------------------------------------
954 _addr.assign(mailbox, len); // We need to save this for later reference
956 // --------------------------------------------------------------------------
957 // Pre-adjustments (optional, as per policy flags). NOTE(review): only the saved copy (_addr) is adjusted; the parsing loop below reads its characters from mailbox -- confirm that this is intended.
958 // --------------------------------------------------------------------------
959 if (_policy_tabs_to_spaces) // Policy: Convert all tabs to spaces
960 for (auto& c : _addr) if (c == u8'\t') c = u8' '; // Per-character, in place (basic_string::replace(first, last, '\t', ' ') selects the (count, char) overload and would overwrite the whole address with 9 spaces)
962 // --------------------------------------------------------------------------
963 // Internal variables.
964 // --------------------------------------------------------------------------
965 int offset = 0; // Offset within original mailbox char8_t[] array
966 int last_display_name = -1; // Used to build type "e" eMail tokens
967 int last_local_part = -1; // Used to build type "e" eMail tokens
968 int last_domain_part = -1; // Used to build type "e" eMail tokens
970 // --------------------------------------------------------------------------
971 // Internal variables that are reset or updated together at various times,
972 // such as when a token is [in most cases] completed.
973 // --------------------------------------------------------------------------
974 int token_begin = 0; // Beginning offset within current portion of string being parsed
975 char8_t ch; // Character being tested (this needs to be defined outside of the main loop)
976 bool flag_utf8 = false; // UTF8 character(s) detected
977 bool flag_angle = false; // Angle-bracket detected
978 bool flag_quote = false; // Quotation-marks mode detected
979 bool active_angle = false; // Angle-bracket mode is active
980 bool active_at_sign = false; // At-sign mode is active (domain-part instead of local-part interpretation)
981 bool active_quote = false; // Quotation-marks mode is active
982 int comment_depth = 0; // Comments are active when this value is greater than 0 (too many closed comments are in the negative)
983 std::u8string p_token; // Processed token data (angle brackets, quotation marks, comments, and whitespace omitted)
984 std::u8string p_token_sp; // Processed token data (angle brackets, quotation marks, and comments omitted), with spaces preserved
986 // --------------------------------------------------------------------------
987 // Main parsing loop that identifies tokens and ensures compliance, and also
988 // effectively pre-processes eMail addresses on-the-fly for faster access
989 // from the _emails vector later.
990 // --------------------------------------------------------------------------
993 // --------------------------------------------------------------------------
994 // Obtain next character.
995 // --------------------------------------------------------------------------
996 ch = mailbox[offset];
998 // --------------------------------------------------------------------------
999 // Compare one character at a time, but first process special cases of quoted
1000 // data (copy most of the data) and comments (ignore the data).
1001 // --------------------------------------------------------------------------
1002 if (flag_quote && active_quote && ch != '"') {
1003 if (QTEXT(ch)) { // Include only quoted text
1004 p_token.push_back(ch);
1005 p_token_sp.push_back(ch);
1006 } // -x- if QTEXT -x-
1008 } else if (comment_depth > 0 && ch != ')') { // Inside a comment: the data is ignored unless the keep-comments policy is set
1009 if (_policy_keep_comments) {
1010 p_token.push_back(ch);
1011 p_token_sp.push_back(ch);
1012 } // -x- if _policy_keep_comments -x-
1015 main_parsing_switch: switch (ch) {
1017 // --------------------------------------------------------------------------
1018 // Quotation mark: toggles quotation-marks mode (re-opening it is an error).
1019 // --------------------------------------------------------------------------
1021 if (!active_quote) { // Enable quotation-marks mode
1022 if (flag_quote) _exception("quotation-marks mode can't be re-opened", offset);
1023 active_quote = true;
1025 } else { // Disable quotation-marks mode
1026 active_quote = false;
1031 // --------------------------------------------------------------------------
1032 // Group name ends with a colon.
1033 // --------------------------------------------------------------------------
1036 // --------------------------------------------------------------------------
1037 // Internal tracking.
1038 // --------------------------------------------------------------------------
1041 // --------------------------------------------------------------------------
1042 // Add this token to the tokens vector.
1043 // --------------------------------------------------------------------------
1044 _tokens.push_back({ .type = 'g',
1045 .offset = token_begin,
1046 .len = offset - token_begin,
1047 .flag_utf8 = flag_utf8,
1048 .p_token = p_token_sp, });
1050 // --------------------------------------------------------------------------
1051 // Reset and prepare internal variables for the next token.
1052 // --------------------------------------------------------------------------
1053 RESET_FOR_NEXT_TOKEN;
1058 // --------------------------------------------------------------------------
1059 // Group of eMail addresses is terminated by a semi-colon.
1060 // --------------------------------------------------------------------------
1063 // --------------------------------------------------------------------------
1064 // Internal tracking.
1065 // --------------------------------------------------------------------------
1066 if (--group_depth < 0) _exception("too many group construct terminators", offset);
1067 if (active_angle) _exception("unbalanced open angle bracket", offset);
1069 // --------------------------------------------------------------------------
1070 // Add this token terminator to the tokens vector.
1071 // --------------------------------------------------------------------------
1072 _tokens.push_back({ .type = ';',
1073 .offset = token_begin,
1074 .len = offset - token_begin,
1075 .flag_utf8 = flag_utf8,
1076 .p_token = p_token_sp, });
1078 // --------------------------------------------------------------------------
1079 // Reset and prepare internal variables for the next token.
1080 // --------------------------------------------------------------------------
1081 RESET_FOR_NEXT_TOKEN;
1086 // --------------------------------------------------------------------------
1087 // Opening angle bracket.
1088 // --------------------------------------------------------------------------
1091 // --------------------------------------------------------------------------
1092 // Internal tracking.
1093 // --------------------------------------------------------------------------
1094 if (flag_angle) _exception("unbalanced open angle bracket", offset);
1095 active_angle = true;
1098 // --------------------------------------------------------------------------
1099 // Add the display-name token to the tokens vector if a display-name exists.
1100 // --------------------------------------------------------------------------
1101 if (token_begin < offset) {
1102 last_display_name = _tokens.size();
1103 _tokens.push_back({ .type = 'n',
1104 .offset = token_begin,
1105 .len = offset - token_begin,
1106 .flag_utf8 = flag_utf8,
1107 .p_token = p_token_sp, });
1108 } // -x- if token_begin -x-
1110 // --------------------------------------------------------------------------
1111 // Reset and prepare internal variables for the next token.
1112 // --------------------------------------------------------------------------
1113 RESET_FOR_NEXT_TOKEN;
1118 // --------------------------------------------------------------------------
1119 // At sign ("@") delimiter.
1120 // --------------------------------------------------------------------------
1123 // --------------------------------------------------------------------------
1124 // Internal tracking.
1125 // --------------------------------------------------------------------------
1126 if (active_at_sign) _exception("too many at (\"@\") signs", offset);
1127 active_at_sign = true;
1129 // --------------------------------------------------------------------------
1130 // Add the local-part token (everything before the "@") to the tokens vector.
1131 // --------------------------------------------------------------------------
1132 last_local_part = _tokens.size();
1133 _tokens.push_back({ .type = 'l',
1134 .offset = token_begin,
1135 .len = offset - token_begin,
1136 .flag_utf8 = flag_utf8,
1137 .flag_angle = flag_angle,
1138 .p_token = p_token, });
1140 // --------------------------------------------------------------------------
1141 // Reset and prepare internal variables for the next token.
1142 // --------------------------------------------------------------------------
1143 RESET_FOR_NEXT_TOKEN;
1148 // --------------------------------------------------------------------------
1149 // Closing angle-bracket.
1150 // --------------------------------------------------------------------------
1153 // --------------------------------------------------------------------------
1154 // Internal tracking.
1155 // --------------------------------------------------------------------------
1156 if (!active_angle) _exception("unbalanced closing angle bracket", offset);
1157 active_angle = false;
1158 goto main_parsing_email;
1160 // --------------------------------------------------------------------------
1161 // Reset and prepare internal variables for the next token.
1162 // --------------------------------------------------------------------------
1163 RESET_FOR_NEXT_TOKEN;
1168 // --------------------------------------------------------------------------
1169 // Comma delimiter, signifies the end of an eMail address.
1170 // --------------------------------------------------------------------------
1174 // --------------------------------------------------------------------------
1175 // Internal tracking.
1176 // --------------------------------------------------------------------------
1177 if (active_quote) _exception("unbalanced quotation-marks", offset);
1178 if (active_angle) _exception("unbalanced open angle bracket before comma", offset);
1181 // --------------------------------------------------------------------------
1182 // Add the domain-part token (or a lone local-part token when no "@" was seen) to the tokens vector.
1183 // --------------------------------------------------------------------------
1184 if (active_at_sign) { // Domain-part has been started
1185 last_domain_part = _tokens.size();
1186 _tokens.push_back({ .type = 'd',
1187 .offset = token_begin,
1188 .len = offset - token_begin,
1189 .flag_utf8 = flag_utf8,
1190 .flag_angle = _tokens[last_local_part].flag_angle,
1191 .p_token = p_token, });
1192 active_at_sign = false;
1193 } else { // Domain-part has not been started, so there's only a local-part here
1194 last_local_part = _tokens.size();
1195 _tokens.push_back({ .type = 'l',
1196 .offset = token_begin,
1197 .len = offset - token_begin,
1198 .flag_utf8 = flag_utf8,
1199 .flag_angle = flag_angle,
1200 .p_token = p_token, });
1201 } // -x- if active_at_sign -x-
1203 // --------------------------------------------------------------------------
1204 // Perform a few checks to make sure we're not creating phantom addresses.
1205 // --------------------------------------------------------------------------
1206 int __email_len = last_domain_part == -1 ? _tokens[last_local_part].len : (_tokens[last_domain_part].offset - _tokens[last_local_part].offset) + _tokens[last_domain_part].len;
1207//std::cout << "__email_len=" << std::to_string(__email_len) << std::endl;
1208 if (__email_len == 0 && !flag_angle) continue;
1209//std::cout << "last_local_part=" << std::to_string(last_local_part) << std::endl;
1210//std::cout << "last_domain_part=" << std::to_string(last_domain_part) << std::endl;
1212 // --------------------------------------------------------------------------
1213 // Create a token of type "e" now that this eMail address is closed.
1215 // The reason we're calculating size based on offsets instead of by adding
1216 // sizes together (and adding 1 for the "@" sign) is that comments can be
1217 // included in the localpart portion, which normally won't be counted in any
1219 // --------------------------------------------------------------------------
1220 _index_e.push_back(_tokens.size()); // Add to index of eMail addresses (before adding to _tokens vector, _tokens.size() is the position)
1221 _tokens.push_back({ .type = 'e',
1222 .offset = _tokens[last_local_part].offset,
1223 .len = __email_len,// - token_begin,
1224 .flag_utf8 = _tokens[last_local_part].flag_utf8 || flag_utf8,
1225 .flag_angle = _tokens[last_local_part].flag_angle,
1226 .flag_null_addr = __email_len == 0,
1227 .p_token = _tokens[last_local_part].p_token + ((last_domain_part == -1 || _tokens[last_domain_part].p_token.empty()) ? u8"" : u8"@" + _tokens[last_domain_part].p_token),
1228 .index_display_name = last_display_name,
1229 .index_local_part = last_local_part,
1230 .index_domain_part = last_domain_part, });
1231 last_display_name = -1;
1232 last_local_part = -1;
1233 last_domain_part = -1;
1236 // --------------------------------------------------------------------------
1237 // Reset and prepare internal variables for the next token.
1238 // --------------------------------------------------------------------------
1239 RESET_FOR_NEXT_TOKEN;
1244 // --------------------------------------------------------------------------
1245 // Opening comment parenthesis.
1246 // --------------------------------------------------------------------------
1252 // --------------------------------------------------------------------------
1253 // Closing comment parenthesis.
1254 // --------------------------------------------------------------------------
1256 if (--comment_depth < 0) _exception("unbalanced closing comment parenthesis", offset);
1260 // --------------------------------------------------------------------------
1261 // Backslash (quote-literal).
1262 // --------------------------------------------------------------------------
1265 // --------------------------------------------------------------------------
1266 // Prevent a potential out-of-bounds buffer-overrun problem.
1267 // --------------------------------------------------------------------------
1268 if (++offset == len) {
1269 _exception("unbalanced quote-literal (backslash)", offset);
1270 continue; // Do this in case we're not throwing exceptions
1271 } // -x- if offset -x-
1273 // --------------------------------------------------------------------------
1274 // Update to next character (whatever it is, we're taking it literally).
1275 // --------------------------------------------------------------------------
1276 ch = mailbox[offset];
1277 goto main_parsing_loop_default; // Fall-through to default
1281 // --------------------------------------------------------------------------
1282 // All remaining characters.
1283 // --------------------------------------------------------------------------
1285 //if (flag_angle) _exception("additional data not permitted", offset);
1286 main_parsing_loop_default:
1287 if (ch > 127) { // Include all UTF-8 character (unless prevented by the exception)
1289 if (!_policy_support_utf8) _exception("UTF-8 byte encountered", offset);
1290 p_token.push_back(ch);
1291 p_token_sp.push_back(ch);
1292 } else if (CTEXT(ch) || ' ') { // Include almost everything for now (including spaces); NOTE(review): "|| ' '" is a non-zero constant, so this condition is always true -- presumably "ch == ' '" was meant, but narrowing it would drop characters that are accepted today, so confirm before changing
1293 if (ch != ' ') p_token.push_back(ch); // Exclude spaces
1294 if (!(ch == ' ' && p_token.size() == 0)) p_token_sp.push_back(ch); // Keep spaces
1297 } // -x- switch ch -x-
1299 } while (++offset < len); // -x- do while -x-
1301 // --------------------------------------------------------------------------
1302 // If the final token isn't empty (a.k.a., unfinished / not sealed), then
1303 // figure out what to do and run one more time, or else throw an exception.
1304 // --------------------------------------------------------------------------
1305 if (offset == len && token_begin < offset) {
1306 ch = ','; // Force comma (",") on parsing loop
1307 goto main_parsing_switch;
1308 } else if (offset > len && token_begin < offset) {
1309 _exception("incomplete data", offset - 1);
1310 } // -x- if offset -x-
1313 }; // -x- rmailaddr* set -x-
1315 /*======================================================================*//**
1317 Find out how many eMail addresses this object holds.
1322 @returns The number of eMail addresses
1323 *///=========================================================================
1324 int size() const { return static_cast<int>(_index_e.size()); } // -x- int size -x- (one entry in _index_e per eMail address; const so it can be called on const objects)
1326 /*======================================================================*//**
1328 Generate a detailed output of all tokens that's useful for debugging.
1332 g = group name (beginning; includes colon)
1333 ; = group termination (semi-colon character)
1335 e = eMail address (includes angle brackets, if present)
1338 c = comment (not implemented)
1339 \0 = not initialized (null; regard as "unknown"; this should never happen)
1342 The difference between "token" and "p_token" is that "token" is the original
1343 and [mostly] unprocessed atom, while "p_token" has been processed with any
1344 sets of angle-brackets, sets of quotation-marks, comments, and whitespace
1345 removed. In nearly all instances, the value of "p_token" is what's needed.
1346 @returns std::string containing multi-line text (one token per line)
1347 *///=========================================================================
1348 std::string tokens_to_string(
1349 /// Filter (string containing characters for those types that are to be
1350 /// included {unrecognized types will be ignored}; the default is no filter)
1351 const std::string& filter = "",
1352 /// Prefix (text to insert before the beginning of each line)
1353 const std::string& prefix = "",
1354 /// End-of-Line sequence (default is "\n")
1355 const std::string& eol = "\n") {
1357 // --------------------------------------------------------------------------
1358 // Internal variables.
1359 // --------------------------------------------------------------------------
1362 // --------------------------------------------------------------------------
1363 // Loop that builds list of tokens (one per line).
1364 // --------------------------------------------------------------------------
1365 for (std::size_t i = 0; i < _tokens.size(); i++) {
1367 // --------------------------------------------------------------------------
1369 // --------------------------------------------------------------------------
1370 if (filter.empty() || filter.find(_tokens[i].type) != std::string::npos) { // An empty filter includes every token; otherwise only types listed in the filter
1372 // --------------------------------------------------------------------------
1373 // Shared characteristics.
1374 // --------------------------------------------------------------------------
1375 t.append(prefix + "index=" + std::to_string(i)
1376 + " type=" + _tokens[i].type
1377 + " utf8=" + (_tokens[i].flag_utf8 ? "y" : "n")
1378 + " punycode=" + (_tokens[i].flag_punycode ? "y" : "n")
1379 + " obsolete=" + (_tokens[i].flag_obsolete ? "y" : "n")
1380 + " offset=" + std::to_string(_tokens[i].offset)
1381 + " length=" + std::to_string(_tokens[i].len)
1382 + " token=" + std::string((char*)_addr.c_str()).substr(_tokens[i].offset, _tokens[i].len)
1383 + " p_token=" + (char*)_tokens[i].p_token.c_str());
1385 // --------------------------------------------------------------------------
1386 // Type-specific characteristics.
1387 // --------------------------------------------------------------------------
1388 switch (_tokens[i].type) {
1390 t.append(std::string( " fqdn=") + (_tokens[i].flag_fqdn ? "y" : "n"));
1393 t.append(std::string( " angle=") + (_tokens[i].flag_angle ? "y" : "n"));
1394 // Fall-through to type "l"
1396 t.append(std::string(" null_addr=") + (_tokens[i].flag_null_addr ? "y" : "n"));
1398 } // -x- switch type -x-
1400 // --------------------------------------------------------------------------
1401 // Final EoL (End of Line) sequence.
1402 // --------------------------------------------------------------------------
1405 } // -x- if filter -x-
1410 }; // -x- std::string tokens_to_string -x-
1412 /*======================================================================*//**
1414 Array-style access to eMail addresses. The first element is at index 0.
1418 @returns std::u8string with only the eMail address (no angle brackets, etc.)
1419 as a native UTF-8 string
1420 *///=========================================================================
1421 std::u8string operator[](
1422 /// Index of eMail address to query for (0 = first element; negative index
1423 /// values are calculated in reverse, starting with -1 as the final position)
1425 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token; // Returns a copy of the processed address; NOTE(review): no range check is made here, so index is expected to lie within -size()..size()-1
1426 }; // -x- std::u8string operator[] -x-
1428 /*======================================================================*//**
1430 Support convenient streaming usage with std::cout, std::cerr, and friends.
1431 @returns eMail address in human-readable form
1432 *///=========================================================================
1433 friend std::ostream& operator<< (
1434 /// Output stream (provided automatically by std::cout and std::cerr)
1436 /// Object class (matched by compiler)
1437 rmailaddr const& c) { return o << reinterpret_cast<const char*>(c._addr.c_str()); }; // -x- std::ostream& operator<< -x- (streams the saved copy of the address as narrow characters without dropping const)
1439 }; // -x- class rmailaddr -x-
1441}; // -x- namespace randolf -x-