randolf.ca  1.00
Randolf Richardson's C++ classes
Loading...
Searching...
No Matches
rmailaddr
1#pragma once
2
3#include <algorithm>
4#include <atomic>
5#include <cstring>
6#include <stdexcept> // std::invalid_argument
7#include <vector>
8
9#include <arpa/inet.h> // Used only for checking for valid IP addresses in domain literals (inet_pton)
10
11namespace randolf {
12
13 // --------------------------------------------------------------------------
14 // Constants that list sets of valid characters, which are optimized to test
15 // for ranges of the most commonly-used characters first during parsing, are
16 // named consistently with their respective rule names as defined in RFC2822.
17 //
18 // CRLF and \ are invisible in the quoted string according to RFC2822 section
19 // 3.2.5.
20 //
21 // RFC2822 section 3.2.5 also defines a "quoted-string" as containing the
22 // following valid characters (spaces are also permitted): 33, 35-91, 93-126
23 // Certain characters must be quoted first though, and every character
24 // following a backslash is taken literally (and the backslash is removed
25 // from the result).
26 //
27 // RFC2822 section 3.2.4 defines an "atom" as containing the following valid
28 // characters: 0123456789
29 // ABCDEFGHIJKLMNOPQRSTUVWXYZ
30 // abcdefghijklmnopqrstuvwxyz
31 // !#$%&'*+-/=?^_`{|}~
32 //
33 // Quote characters and quotation marks are not permitted in the domain part.
34 //
35 // According to RFC2822 section 3.2.5, a phrase (DisplayName / Comments) can
36 // be either an atom (ATEXT) or quoted-text (QTEXT).
37 //
38 // According to RFC2822 section 2.2.2, whitespace characters are tabs (ASCII
39 // character 9) and spaces (ASCII character 32).
40 //
41 // RFC2822 section 3.4 last paragraph indicates that a group construct is
42 // optional, and preceded by a colon following any number of comma-delimited
43 // recipients (including zero or one). Group constructs must end with a
44 // semi-colon though.
45 // --------------------------------------------------------------------------
46
47 // --------------------------------------------------------------------------
48 // The following macros are optimized for performance by testing for the most
49 // commonly-used characters first.
50 //
51 // ATEXT
52 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
53 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
54 // 47...57 /0123456789
55 // 45 | 33 -!
56 // 35...39 #$%&'
57 // 42 | 43 *+
58 // 61 | 63 =?
59 // --------------------------------------------------------------------------
// ATEXT(a) -- true when character code "a" belongs to the RFC2822 "atext"
// set (letters, digits, and the punctuation listed in the table above).
// Ranges are tested most-common-first.  Every use of the argument is
// parenthesized so that an expression argument such as ATEXT(c & 0x7f) is
// classified correctly instead of being re-associated by operator precedence.
// NOTE: the argument is evaluated several times, so it must not have side
// effects (e.g., do not write ATEXT(*p++)).
#define ATEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 65 && (a) <= 90) \
               || ((a) >= 47 && (a) <= 57) \
               || (a) == 45 || (a) == 33 \
               || ((a) >= 35 && (a) <= 39) \
               || (a) == 42 || (a) == 43 \
               || (a) == 61 || (a) == 63 )
67
68 // --------------------------------------------------------------------------
69 // ATEXT_OBS ("obsolete standard" is ATEXT plus periods, spaces, and tabs)
70 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
71 // 65...90 ABCDEFGHIJKLMNOPQRSTUVWXYZ
72 // 45...57 -./0123456789
73 // 32 | 33 {space:32}!
74 // 35...39 #$%&'
75 // 42 | 43 *+
76 // 61 | 63 =?
77 // 9 {tab:9}
78 // --------------------------------------------------------------------------
// ATEXT_OBS(a) -- true when character code "a" is acceptable as "obsolete
// standard" atom text: everything ATEXT accepts, plus period (46), space
// (32), and tab (9).  Ranges are tested most-common-first.  The argument is
// parenthesized at every use so that expression arguments are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define ATEXT_OBS(a) (((a) >= 94 && (a) <= 126) \
                   || ((a) >= 65 && (a) <= 90) \
                   || ((a) >= 45 && (a) <= 57) \
                   || (a) == 32 || (a) == 33 \
                   || ((a) >= 35 && (a) <= 39) \
                   || (a) == 42 || (a) == 43 \
                   || (a) == 61 || (a) == 63 \
                   || (a) == 9 )
87
88 // --------------------------------------------------------------------------
89 // CTEXT (comment text)
90 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
91 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
92 // 33...39 !"#$%&'
93 // --------------------------------------------------------------------------
// CTEXT(a) -- true when character code "a" may appear inside a comment:
// printable ASCII 33..126 except "(" (40), ")" (41), and "\" (92).  The
// argument is parenthesized at every use so that expression arguments are
// safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define CTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 42 && (a) <= 91) \
               || ((a) >= 33 && (a) <= 39) )
97
98 // --------------------------------------------------------------------------
99 // CTEXT_WSP (comment text with white space)
100 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
101 // 42...91 *+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
102 // 32...39 {space:32}!"#$%&'
103 // 9 {tab:9}
104 // --------------------------------------------------------------------------
// CTEXT_WSP(a) -- true when character code "a" is comment text or white
// space: the CTEXT ranges widened to include space (32), plus tab (9).  The
// argument is parenthesized at every use so that expression arguments are
// safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define CTEXT_WSP(a) (((a) >= 93 && (a) <= 126) \
                   || ((a) >= 42 && (a) <= 91) \
                   || ((a) >= 32 && (a) <= 39) \
                   || (a) == 9 )
109
110 // --------------------------------------------------------------------------
111 // CTEXT_OBS (obsolete comment text)
112 // CTEXT {CTEXT}
113 // 32 {space:32}
114 // 9 {tab:9}
115 // --------------------------------------------------------------------------
// CTEXT_OBS(a) -- true when character code "a" is obsolete comment text:
// anything CTEXT accepts, plus space (32) and tab (9).  The argument is
// parenthesized at every use (including when it is forwarded to CTEXT) so
// that expression arguments are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define CTEXT_OBS(a) (CTEXT((a)) \
                   || (a) == 32 \
                   || (a) == 9 )
119
120 // --------------------------------------------------------------------------
121 // DTEXT (domain-part, not including characters needed for domain-literals)
122 // 94...126 ^_`abcdefghijklmnopqrstuvwxyz{|}~
123 // 33...90 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ
124 // --------------------------------------------------------------------------
// DTEXT(a) -- true when character code "a" may appear in a domain-part:
// printable ASCII 33..126 except "[" (91), "\" (92), and "]" (93), which are
// reserved for domain-literals.  The argument is parenthesized at every use
// so that expression arguments are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define DTEXT(a) (((a) >= 94 && (a) <= 126) \
               || ((a) >= 33 && (a) <= 90) )
127
128 // --------------------------------------------------------------------------
129 // FWS (folding white space)
130 // 10 {lf:10}
131 // 13 {cr:13}
132 // 9 {tab:9}
133 // --------------------------------------------------------------------------
// FWS(a) -- true when character code "a" is one of the folding-white-space
// characters tested here: line-feed (10), carriage-return (13), or tab (9).
// The argument is parenthesized at every use so that expression arguments
// are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define FWS(a) ((a) == 10 \
             || (a) == 13 \
             || (a) == 9 )
137
138 // --------------------------------------------------------------------------
139 // QTEXT (quoted text)
140 // 93...126 ]^_`abcdefghijklmnopqrstuvwxyz{|}~
141 // 35...91 #$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[
142 // 32 {space:32}
143 // 33 !
144 // --------------------------------------------------------------------------
// QTEXT(a) -- true when character code "a" may appear unescaped inside a
// quoted string: space (32) and printable ASCII 33..126 except the quotation
// mark (34) and backslash (92).  The argument is parenthesized at every use
// so that expression arguments are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define QTEXT(a) (((a) >= 93 && (a) <= 126) \
               || ((a) >= 35 && (a) <= 91) \
               || (a) == 32 || (a) == 33 )
148
149 // --------------------------------------------------------------------------
150 // TEXT
151 // 14...127 {char:14-31}{space:32}!"#$%&'()*+,-./0123456789:;<=>
152 // ?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^
153 // _`abcdefghijklmnopqrstuvwxyz{|}~
154 // {delete:127}
155 // 1...9 {char:1-6}{beep:7}{backspace:8}{tab:9}
156 // 11 | 12 {char:11}{char:12}
157 // --------------------------------------------------------------------------
// TEXT(a) -- true when character code "a" is any 7-bit character other than
// NUL (0), line-feed (10), and carriage-return (13).  The argument is
// parenthesized at every use so that expression arguments are safe.
// NOTE: the argument is evaluated several times, so it must not have side
// effects.
#define TEXT(a) (((a) >= 14 && (a) <= 127) \
              || ((a) >= 1 && (a) <= 9) \
              || (a) == 11 || (a) == 12 )
161
162 // --------------------------------------------------------------------------
163 // WSP (white space)
164 // 32 {space:32}
165 // 9 {tab:9}
166 // --------------------------------------------------------------------------
// WSP(a) -- true when character code "a" is a space (32) or a tab (9).  The
// argument is parenthesized at every use so that expression arguments are
// safe.
// NOTE: the argument is evaluated twice, so it must not have side effects.
#define WSP(a) ((a) == 32 \
             || (a) == 9 )
169
170 // --------------------------------------------------------------------------
171 // Used by the set() method to consistently reset internal variables when
172 // moving onward to the next token.
173 //
174 // token_begin: Configures beginning of next token.
175 // --------------------------------------------------------------------------
// RESET_FOR_NEXT_TOKEN -- clears the per-token parsing state and marks the
// byte after "offset" as the start of the next token.  It relies on the
// following names being in scope at the point of use:  flag_utf8,
// flag_quote, token_begin, offset, p_token, and p_token_sp.
//
// The statements are wrapped in a single brace-enclosed block so that the
// macro behaves as one statement (for example, under an unbraced "if" all of
// the resets are conditional, not just the first).  It may be written with or
// without a trailing semicolon.
#define RESET_FOR_NEXT_TOKEN \
          { \
            flag_utf8 = false; \
            flag_quote = false; \
            token_begin = offset + 1; \
            p_token.clear(); \
            p_token_sp.clear(); \
          }
182
183 /*======================================================================*//**
184 @brief
185 This @ref rmailaddr class provides an object-oriented eMail address.
186
187 @par Features
188
189 Some of the key features are:
190
191 - constructors with sensible defaults help to simplify coding
192 - documentation includes code samples (with @c \#include lines as needed)
193 - can handle ASCIIZ without needing to specify string length
194 - can handle @c std::string (which tracks its own string length)
195
196 @par Use case
197
198 Validation of the format of an eMail address is helpful in ensuring that
199 eMail addresses received from elsewhere comply with internet standards.
200
201 @par Background
202
203 I created this class to make it easier to write internet server daemons and
204 other software that needs to accept and/or handle eMail addresses. (This is
205 a complete re-write of the version I wrote in Java 17 years ago in 2007,
206 which includes a significant array of differences due to the improved parsing
207 approaches I use now that are more efficient, and the need to make sure that
208 UTF-8 characters and punycode are both handled in a transparent manner.)
209
210 @par Getting started
211
212 @author Randolf Richardson
213 @version 1.00
214 @par History
215 - 2024-May-07 v1.00 Initial version
216 - 2025-Feb-03 v1.00 Increased use of references and pointers
217
218 @par Conventions
219 Lower-case letter "m" is regularly used in partial example code to represent
220 an instantiated rmailaddr object.
221
222 An ASCIIZ string is a C-string (char* array) that includes a terminating null
223 (0) character at the end.
224
225 @par Notes
226
227 I use the term "ASCIIZ string" to indicate an array of characters that's
228 terminated by a 0 (a.k.a., null). Although this is very much the same as a
229 C-string, the difference is that in many API functions a C-string must often
230 be accompanied by its length value. When referring to an ASCIIZ string, I'm
231 intentionally indicating that the length of the string is not needed because
232 the string is null-terminated. (This term was also commonly used in assembly
233 language programming in the 1970s, 1980s, and 1990s, and as far as I know is
234 still used by machine language programmers today.)
235
236 @par Examples
237
238 @code{.cpp}
239 #include <iostream> // std::cout, std::cerr, std::endl, etc.
240 #include <stdexcept> // std::invalid_argument exception
241
242 #include <randolf/rmailaddr>
243
244 int main(int argc, char *argv[]) {
245 try {
246 randolf::rmailaddr m("nobody@example.com");
247 } catch (const std::invalid_argument& e) {
248 std::cerr << "eMail address format exception: " << e.what() << std::endl;
249 return EXIT_FAILURE;
250 } catch (const std::exception& e) {
251 std::cerr << "Other exception: " << e.what() << std::endl;
252 return EXIT_FAILURE;
253 }
254 return EXIT_SUCCESS;
255 } // -x- int main -x-
256 @endcode
257
258 Parameter stacking is supported (with methods that return @c rmailaddr&); in
259 this example, notice that semicolons (";") and "e." references are omitted
260 (when compared with the above):
261
262 @code{.cpp}
263 #include <iostream> // std::cout, std::cerr, std::endl, etc.
264 #include <stdexcept> // std::invalid_argument exception
265
266 #include <randolf/rmailaddr>
267
268 int main(int argc, char *argv[]) {
269 try {
270 randolf::rmailaddr m("nobody@example.com");
271 } catch (const std::invalid_argument& e) {
272 std::cerr << "eMail address format exception: " << e.what() << std::endl;
273 return EXIT_FAILURE;
274 } catch (const std::exception& e) {
275 std::cerr << "Other exception: " << e.what() << std::endl;
276 return EXIT_FAILURE;
277 }
278 return EXIT_SUCCESS;
279 } // -x- int main -x-
280 @endcode
281 *///=========================================================================
282 class rmailaddr {
283
284 public:
285 /*======================================================================*//**
286 @brief
287 Structure of errors (only used when exceptions are disabled).
288 @see errors
289 @see policy_throw_exceptions
290 *///=========================================================================
struct error_data {
  /// Error message
  std::string message;
  /// Offset (0 = position of first byte); starts at zero so that a
  /// default-constructed error_data never carries an indeterminate value
  unsigned int offset = 0;
}; // -x- struct error_data -x-
297
298 private:
299 /*======================================================================*//**
300 @brief
301 Structure of positions within the original eMail string where a portion
302 begins, and its length (in bytes), along with various other information about
303 the section.
304
305 This is used internally, and std::vector<mail_addr_token> organizes them and
306 looks after freeing memory.
307 *///=========================================================================
308 struct mail_addr_token {
309 /// Types:
310 /// g = group name (beginning; includes colon)
311 /// ; = group termination (semi-colon character)
312 /// n = display name
313 /// e = eMail address (includes angle brackets, if present)
314 /// l = local-part
315 /// d = domain-part
316 /// c = comment
317 /// \0 = not initialized (null can effectively be regarded as meaning "unknown")
318 char type = (char)0;
319 /// Offset, within the string, where this part begins
320 unsigned int offset = 0;
321 /// Total number of bytes
322 unsigned int len = 0;
323 /// Whether any UTF-8 characters are present in this part
324 bool flag_utf8 = false;
325 /// Whether this part is in punycode (begins with "xn--")
326 bool flag_punycode = false; // TODO
327 /// Whether this part is "obsolete" (according to RFCs)
328 bool flag_obsolete = false; // TODO
329 /// Whether eMail address was enclosed in angle brackets (type "e" only)
330 bool flag_angle = false;
331 /// Whether the token was enclosed in quotation marks
332 bool flag_quotes = false;
333 /// Whether eMail address is a null address enclosed in angle brackets (type "e" only)
334 bool flag_null_addr = false;
335 /// Whether the domain-part is an FQDN (type "d" only)
336 bool flag_fqdn = false; // TODO
337 /// Whether the domain-part is a domain-literal (type "d" only)
338 bool flag_domain_literal = false; // TODO
339 /// Depth of groups (types "g" and ";" only)
340 unsigned short depth = 0;
341 /// Processed data, with quotation marks, angle brackets, comments, whitespace, etc., removed
342 std::u8string p_token;
343 /// Index to display-name (type "e" only)
344 int index_display_name = -1;
345 /// Index to local-part (type "e" only)
346 int index_local_part = -1;
347 /// Index to domain-part (type "e" only)
348 int index_domain_part = -1;
349 }; // -x- struct mail_addr_token -x-
350
351 // --------------------------------------------------------------------------
352 // Internal variables.
353 // --------------------------------------------------------------------------
354 std::u8string _addr; // Original eMail address
355 std::vector<mail_addr_token> _tokens; // All eMail address tokens
356 std::vector<int> _index_e; // Index of type "e" records in _tokens
357 std::vector<error_data> _errors; // Error tracking
358 short group_depth = 0; // Recursive group tracking
359 bool angle_bracket_mode = false; // Angle-bracket mode tracking
360 bool quote_mode = false; // Quotation-marks mode tracking
361
362 // --------------------------------------------------------------------------
363 // Policy variables.
364 // --------------------------------------------------------------------------
// Whether to retain comments embedded in eMail addresses
bool _policy_keep_comments{false};
// TRUE = throw exceptions; FALSE = save errors internally
bool _policy_throw_exceptions{true};
// Whether to convert every tab into a space
bool _policy_tabs_to_spaces{false};
// Whether to support UTF-8 (FALSE = 7bit characters only)
bool _policy_support_utf8{true};
369
370 /*======================================================================*//**
371 Exception handler.
372 *///=========================================================================
373 void _exception(
374 /// Error message
375 const std::string& message,
376 /// Offset (0 = position of first byte)
377 const int offset) {
378 if (_policy_throw_exceptions) throw std::invalid_argument(message + " at offset " + std::to_string(offset));
379 _errors.push_back({ message, offset });
380 return;
381 } // -x- void _exception -x-
382
383 public:
384 /*======================================================================*//**
385 @brief
386 Instantiate an empty rmailaddr that doesn't qualify as a properly-formatted
387 internet eMail address (because the minimum length of a valid internet eMail
388 address is 1 character).
389
390 Instantiating an empty rmailaddr is particularly useful for header-file
391 definitions; for example:
392 @code{.cpp}
393 #include <iostream> // std::cout, std::cerr, std::endl, etc.
394 #include <stdexcept> // std::invalid_argument exception
395
396 #include <randolf/rmailaddr>
397
398 randolf::rmailaddr m; // <-- Empty rmailaddr initialization (no exceptions)
399
400 int main(int argc, char *argv[]) {
401 try {
402 m.set("nobody@example.com");
403 } catch (const std::invalid_argument& e) {
404 std::cerr << "eMail address format exception: " << e.what() << std::endl;
405 return EXIT_FAILURE;
406 } catch (const std::exception& e) {
407 std::cerr << "Other exception: " << e.what() << std::endl;
408 return EXIT_FAILURE;
409 }
410 return EXIT_SUCCESS;
411 } // -x- int main -x-
412 @endcode
413 *///=========================================================================
414 rmailaddr() noexcept {} // -x- constructor rmailaddr -x-
415
416 /*======================================================================*//**
417 @brief
418 Instantiate an rmailaddr that qualifies as a properly-formatted internet
419 eMail address (if it doesn't qualify, then an exception will be thrown).
420
421 Usage example:
422 @code{.cpp}
423 #include <iostream> // std::cout, std::cerr, std::endl, etc.
424 #include <stdexcept> // std::invalid_argument exception
425
426 #include <randolf/rmailaddr>
427
428 int main(int argc, char *argv[]) {
429 try {
430 randolf::rmailaddr m("nobody@example.com");
431 } catch (const std::invalid_argument& e) {
432 std::cerr << "eMail address format exception: " << e.what() << std::endl;
433 return EXIT_FAILURE;
434 } catch (const std::exception& e) {
435 std::cerr << "Other exception: " << e.what() << std::endl;
436 return EXIT_FAILURE;
437 }
438 return EXIT_SUCCESS;
439 } // -x- int main -x-
440 @endcode
441 @throws std::invalid_argument describing the problem, along with the byte
442 offset where the problem originated from
443 @see rmailaddr
444 *///=========================================================================
445 rmailaddr(
446 /// RFC-compliant eMail address
447 const char8_t* mailbox,
448 /// Number of characters (-1 = ASCIIZ string)
449 const int len = -1) {
450 set(mailbox, len);
451 } // -x- constructor rmailaddr -x-
452
453 /*======================================================================*//**
454 @copydoc rmailaddr(const char8_t*, int)
455 @see rmailaddr
456 *///=========================================================================
457 rmailaddr(
458 /// RFC-compliant eMail address
459 const char* mailbox,
460 /// Number of characters (-1 = ASCIIZ string)
461 const int len = -1) {
462 set((char8_t*)mailbox, len);
463 } // -x- constructor rmailaddr -x-
464
465 /*======================================================================*//**
466 @copydoc rmailaddr(const char8_t*, int)
467 @see rmailaddr
468 *///=========================================================================
469 rmailaddr(
470 /// RFC-compliant eMail address
471 const std::string& mailbox) {
472 set((char8_t*)mailbox.data(), mailbox.size());
473 } // -x- constructor rmailaddr -x-
474
475 /*======================================================================*//**
476 @copydoc rmailaddr(const char8_t*, int)
477 @see rmailaddr
478 *///=========================================================================
479 rmailaddr(
480 /// RFC-compliant eMail address
481 const std::u8string& mailbox) {
482 set(mailbox.data(), mailbox.size());
483 } // -x- constructor rmailaddr -x-
484
485 /*======================================================================*//**
486 @brief
487 Access only the eMail address, without display-name, and without any sets of
488 enclosing quotation-marks or enclosing angle-brackets, etc.
489 @see display_name
490 @see domain_part
491 @see email
492 @see local_part
493 @see operator[](int)
494 @throws std::out_of_range if the index is out-of-range
495 @returns std::string with only the eMail address (no display-name, and no
496 enclosing sets of quotation-marks or enclosing angle-brackets, etc.)
497 *///=========================================================================
498 std::string addr(
499 /// Index of eMail address to query for (0 = first element; negative index
500 /// values are calculated in reverse, starting with -1 as the final position)
501 const int index = 0) {
502// return std::string((char*)_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token.c_str());
503 return std::string((char*)_tokens.at(_index_e.at(index >= 0 ? index : _index_e.size() + index)).p_token.c_str());
504 } // -x- std::string addr -x-
505
506 /*======================================================================*//**
507 @brief
508 Access an eMail address's display-name (the portion preceding the angle
509 brackets).&nbsp; If there were no angle-brackets, then an empty string will
510 be returned.
511 @see addr
512 @see domain_part
513 @see email
514 @see local_part
515 @see operator[](int)
516 @returns std::string with only the display-name (no quotation marks, etc.)
517 *///=========================================================================
518 std::string display_name(
519 /// Index of eMail address to query for (0 = first element; negative index
520 /// values are calculated in reverse, starting with -1 as the final position)
521 const int index = 0) {
522 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name].p_token.c_str());
523 } // -x- std::string display_name -x-
524
525 /*======================================================================*//**
526 @brief
527 Access an eMail address's domain-part (the portion following the @c @ sign).
528 @see get
529 @see addr
530 @see display_name
531 @see email
532 @see local_part
533 @see operator[](int)
534 @returns std::string with only the domain-part (no angle brackets, etc.)
535 *///=========================================================================
536 std::string domain_part(
537 /// Index of eMail address to query for (0 = first element; negative index
538 /// values are calculated in reverse, starting with -1 as the final position)
539 int index = 0) {
540 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_domain_part].p_token.c_str());
541 } // -x- std::string domain_part -x-
542
543 /*======================================================================*//**
544 @brief
545 Access an eMail address (enclosed in angle-brackets), and preceded by the
546 display-name (if one is available).
547
548 @note
549 If the original form of the display-name had a delimiting space before the
550 eMail address, then that space will be present in the result here. If not, a
551 space will not be inserted. (In other words, this aspect of the original
552 full eMail address will be retained.)
553 @see addr
554 @see display_name
555 @see domain_part
556 @see local_part
557 @see operator[](int)
558 @returns std::string with display-name and eMail address (in angle-brackets)
559 *///=========================================================================
560 std::string email(
561 /// Index of eMail address to query for (0 = first element; negative index
562 /// values are calculated in reverse, starting with -1 as the final position)
563 int index = 0) {
564
565 // --------------------------------------------------------------------------
566 // The eMail address has no display-name because it wasn't enclosed in angle
567 // brackets, so present the eMail address on its own, in angle brackets.
568 // --------------------------------------------------------------------------
569 mail_addr_token e = _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]];
570 if (e.index_display_name < 0)
571 return "<" + std::string((char*)e.p_token.c_str()) + ">";
572
573 // --------------------------------------------------------------------------
574 // There was a display-name, so return the eMail address with display-name
575 // (enclosed in quotation marks if it started out that way).
576 // --------------------------------------------------------------------------
577 mail_addr_token n = _tokens[e.index_display_name];
578 if (n.flag_quotes)
579 return "\""
580 + std::string((char*)n.p_token.c_str())
581 + "\""
582 + "<"
583 + std::string((char*)e.p_token.c_str())
584 + ">";
585
586 return std::string((char*)n.p_token.c_str())
587 + "<"
588 + std::string((char*)e.p_token.c_str())
589 + ">";
590
591 } // -x- std::string email -x-
592
593 /*======================================================================*//**
594 @brief
595 Find out if this object doesn't hold any eMail addresses.
596 @see has_any
597 @see has_multiple
598 @see has_one
599 @see size
600 @returns TRUE = no eMail addresses@n
601 FALSE = one or more eMail addresses
602 *///=========================================================================
603 bool empty() {
604 return _index_e.empty();
605 } // -x- bool empty -x-
606
607 /*======================================================================*//**
608 @brief
609 Return a list of errors that have been collected (instead of throwing
610 exceptions).
611 @see errors_clear
612 @see policy_throw_exceptions
613 @returns Vector containing @c error_data
614 *///=========================================================================
615 std::vector<error_data> errors() {
616 return _errors;
617 } // -x- std::vector<error-data> errors -x-
618
619 /*======================================================================*//**
620 @brief
621 Clear the list of errors that have been collected (instead of throwing
622 exceptions).
623 @see errors
624 @see policy_throw_exceptions
625 @returns The same rmailaddr object so as to facilitate stacking
626 *///=========================================================================
627 rmailaddr& errors_clear() {
628 _errors.clear();
629 return *this;
630 } // -x- rmailaddr& errors_clear -x-
631
632 /*======================================================================*//**
633 @brief
634 Grade an eMail address, similar to traditional elementary school grades. For
635 simplicity, grades "a" through "c" are passes, while grades "d" through "f"
636 are failures, although if less strict then "d" should also qualify as a pass.
637
638 @code
639 Ratings:
640 a = Angle-brackets surrounding eMail address (optional display-name)
641 b = Bare eMail address (no display-name)
642 c = Complex eMail address (groups; optional angle-brackets; optional display-name)
643 d = Defective (because obsolete RFC standards were utilized)
644 e = Errors (only when collecting errors instead of throwing exceptions)
645 f = Failure (an exception was thrown, or eMail address is blank)
646 @endcode
647
648 To test for a pass, use a comparison such as <tt>m.grade() <= 'c'</tt>
649 (strict) or <tt>m.grade() <= 'd'</tt> (not strict).
650 @returns Rating code
651 *///=========================================================================
char grade(
    /// eMail address index (default is 0 for the first eMail address)
    const int index = 0) {
  // The index is not consulted yet: every request receives the failing grade.
  static_cast<void>(index);
  constexpr char failing_grade = 'f';
  return failing_grade;
} // -x- char grade -x-
657
658 /*======================================================================*//**
659 @brief
660 Indicates whether a display-name was included with this eMail address.
661 @returns TRUE = eMail address includes a display-name@n
662 FALSE = eMail address has no display-name
663 *///=========================================================================
664 bool has_display_name(
665 /// eMail address index (default is 0 for the first eMail address)
666 const int index = 0) {
667 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
668 } // -x- bool has_display-name -x-
669
670 /*======================================================================*//**
671 @brief
672 Find out whether this object holds any number of eMail addresses. If there
673 are no eMail addresses, then this method returns @c FALSE.
674 @see empty
675 @see has_multiple
676 @see has_one
677 @see size
678 @returns TRUE = one or more eMail addresses@n
679 FALSE = no eMail addresses
680 *///=========================================================================
681 bool has_any() {
682 return _index_e.size() > 0;
683 } // -x- bool has_any -x-
684
685 /*======================================================================*//**
686 @brief
687 Find out whether this object holds multiple eMail addresses. If there is
688 only one eMail address, or no eMail addresses at all, then this method
689 returns @c FALSE.
690 @see empty
691 @see has_any
692 @see has_one
693 @see size
694 @returns TRUE = two or more eMail addresses@n
695 FALSE = one eMail address@n
696 FALSE = no eMail addresses
697 *///=========================================================================
698 bool has_multiple() {
699 return _index_e.size() > 1;
700 } // -x- bool has_multiple -x-
701
702 /*======================================================================*//**
703 @brief
704 Find out whether this object holds exactly one eMail address. If there are
705 two or more eMail addresses, or no eMail addresses, then this method returns
706 @c FALSE.
707 @see empty
708 @see has_any
709 @see has_multiple
710 @see size
711 @returns TRUE = exactly one eMail address@n
712 FALSE = two or more eMail addresses@n
713 FALSE = no eMail addresses
714 *///=========================================================================
715 bool has_one() {
716 return _index_e.size() == 1;
717 } // -x- bool has_one -x-
718
719 /*======================================================================*//**
720 @brief
721 Find out the state of this policy.
722 @see policy_keep_comments
723 @returns policy status
724 *///=========================================================================
725 bool is_policy_keep_comments() {
726 return _policy_keep_comments;
727 } // -x- bool is_policy_keep_comments -x-
728
729 /*======================================================================*//**
730 @brief
731 Find out the state of this policy.
732 @see policy_tabs_to_spaces
733 @returns policy status
734 *///=========================================================================
735 bool is_policy_tabs_to_spaces() {
736 return _policy_tabs_to_spaces;
737 } // -x- bool is_policy_tabs_to_spaces -x-
738
739 /*======================================================================*//**
740 @brief
741 Find out the state of this policy.
742 @see policy_throw_exceptions
743 @returns policy status
744 *///=========================================================================
745 bool is_policy_throw_exceptions() {
746 return _policy_throw_exceptions;
747 } // -x- bool is_policy_throw_exceptions -x-
748
749 /*======================================================================*//**
750 @brief
751 Find out the state of this policy.
752 @see policy_support_utf8
753 @returns policy status
754 *///=========================================================================
755 bool is_policy_support_utf8() {
756 return _policy_support_utf8;
757 } // -x- bool is_policy_support_utf8 -x-
758
759 /*======================================================================*//**
760 @brief
761 Indicates whether this is just an eMail address, without any other parts such
762 as display-name, group constructs, comments, etc.
763 @returns TRUE = eMail address includes a display-name@n
764 FALSE = eMail address has no display-name
765 *///=========================================================================
766 bool is_pure(
767 /// indicate whether angle-brackets are okay (default is FALSE so that the
768 /// meaning of the word "pure" is not tainted)
769 const bool angle_flag = false,
770 /// eMail address index (default is 0 for the first eMail address)
771 const int index = 0) {
772//TODO: Finish this (we need to consider groups, display-name, comments, etc.)
773 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_display_name != -1;
774 } // -x- bool is_pure -x-
775
776 /*======================================================================*//**
777 @brief
778 Access an eMail address's local-part (the portion preceding the @c @ sign).
779 @see addr
780 @see display_name
781 @see domain_part
782 @see email
783 @see operator[](int)
784 @returns std::string with only the local-part (no angle brackets, etc.)
785 *///=========================================================================
786 std::string local_part(
787 /// Index of eMail address to query for (0 = first element; negative index
788 /// values are calculated in reverse, starting with -1 as the final position)
789 int index = 0) {
790 return std::string((char*)_tokens[_tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].index_local_part].p_token.c_str());
791 } // -x- std::string local_part -x-
792
793 /*======================================================================*//**
794 @brief
795 Sets the policy for whether to keep comments that were embedded in eMail
796 address group-construct, display-name, and local-part portions.
797
798 Comments are excluded by default because most systems don't need them, but in
799 the event that they are needed (or wanted), this policy makes it possible to
800 make sure they aren't excluded during processing.
801 @see set
802 @see is_policy_keep_comments
803 @returns The same rmailaddr object so as to facilitate stacking
804 *///=========================================================================
805 rmailaddr& policy_keep_comments(
806 /// FALSE = do not retain comments embedded in eMail addresses (deafult)@n
807 /// TRUE = retain comments embedded in eMail addresses
808 bool policy_flag) {
809
810 // --------------------------------------------------------------------------
811 // Update internal policy.
812 // --------------------------------------------------------------------------
813 _policy_keep_comments = policy_flag;
814
815 // --------------------------------------------------------------------------
816 // Return this object to facilitate stacking.
817 // --------------------------------------------------------------------------
818 return *this;
819
820 } // -x- rmailaddr& policy_keep_comments -x-
821
822 /*======================================================================*//**
823 @brief
824 Sets the policy for whether to support UTF-8 characters.
825
826 Some older systems may not be able to handle 8-bit data that UTF-8 utilizes,
827 in which case this policy makes it possible to easily reject incompatible
828 eMail addresses before attempting to use them with such systems.
829 @see set
830 @see is_policy_support_utf8
831 @returns The same rmailaddr object so as to facilitate stacking
832 *///=========================================================================
833 rmailaddr& policy_support_utf8(
834 /// TRUE = support UTF-8 characters in eMail addresses (deafult)@n
835 /// FALSE = do not support UTF-8 characters in eMail addresses
836 bool policy_flag) {
837
838 // --------------------------------------------------------------------------
839 // Update internal policy.
840 // --------------------------------------------------------------------------
841 _policy_support_utf8 = policy_flag;
842
843 // --------------------------------------------------------------------------
844 // Return this object to facilitate stacking.
845 // --------------------------------------------------------------------------
846 return *this;
847
848 } // -x- rmailaddr& policy_support_utf8 -x-
849
850 /*======================================================================*//**
851 @brief
852 Sets the policy for whether to convert every tab character (ASCII character 9)
853 to a space (ASCII character 32). This conversion occurs only once when the
854 eMail address is initially specified in a constructor or by way of one of the
855 @ref set() methods (changing this policy after this point will not be applied
856 to the current eMail address, but it will be in effect for future calls to
857 any of the @ref set() methods).
858
859 There are some situations where a tab character can create problems, such as
860 when interacting with certain older software or software that makes incorrect
861 assumptions about how to parse an eMail address, and this policy makes it
862 easy to accommodate such situations for the tab character, which some users
863 may be including by using the tab key on their keyboards.
864 @see set
865 @see is_policy_tabs_to_spaces
866 @returns The same rmailaddr object so as to facilitate stacking
867 *///=========================================================================
868 rmailaddr& policy_tabs_to_spaces(
869 /// TRUE = convert every tab character to a space@n
870 /// FALSE = do not convert tab characters to spaces (default)
871 bool policy_flag) {
872
873 // --------------------------------------------------------------------------
874 // Update internal policy.
875 // --------------------------------------------------------------------------
876 _policy_tabs_to_spaces = policy_flag;
877
878 // --------------------------------------------------------------------------
879 // Return this object to facilitate stacking.
880 // --------------------------------------------------------------------------
881 return *this;
882
883 } // -x- rmailaddr& policy_tabs_to_spaces -x-
884
885 /*======================================================================*//**
886 @brief
887 Sets the policy for whether to throw exceptions when an error is encountered.
888
889 When this flag is cleared (set to FALSE), errors are tracked internally instead of throwing any
890 exceptions, and will need to be retrieved using the @ref errors() method,
891 which is useful for analyzing an eMail address. (Enabling or disabling this
892 flag does not erase the errors that are stored internally; you will need to
893 use the @ref errors_clear method for this.)
894
895 @warning
896 This policy is not meant for general use in the majority of applications; it
897 is intended for technical analysis, which would be useful in diagnostic and
898 research applications, or packet analysis applications like WireShark, or for
899 advanced users who are interested in more technical detail.
900 @see errors
901 @see errors_clear
902 @see is_policy_throw_exceptions
903 @returns The same rmailaddr object so as to facilitate stacking
904 *///=========================================================================
905 rmailaddr& policy_throw_exceptions(
906 /// TRUE = throw exceptions (default)@n
907 /// FALSE = don't throw exceptions
908 bool policy_flag) {
909
910 // --------------------------------------------------------------------------
911 // Update internal policy.
912 // --------------------------------------------------------------------------
913 _policy_throw_exceptions = policy_flag;
914
915 // --------------------------------------------------------------------------
916 // Return this object to facilitate stacking.
917 // --------------------------------------------------------------------------
918 return *this;
919
920 } // -x- rmailaddr& policy_throw_exceptions -x-
921
922 /*======================================================================*//**
923 @brief
924 Set a new eMail address, resetting all internal flags, counters, and arrays
925 (but not changing any existing policies). Any existing eMail addresses will
926 be cleared out. (This method is also used internally by most of this class's
927 constructors.)
928 @throws std::invalid_argument describing the problem, along with the byte
929 offset where the problem originated from
930 @see rmailaddr
931 @returns The same rmailaddr object so as to facilitate stacking
932 *///=========================================================================
933 rmailaddr& set(
934 /// RFC-compliant eMail address
935 const char* mailbox,
936 /// Number of characters (-1 = ASCIIZ string)
937 int len = -1) {
938 set((char8_t*)mailbox, len);
939 return *this;
940 } // -x- rmailaddr& set -x-
941
942 /*======================================================================*//**
943 @copydoc set(const char*, int)
944 @see rmailaddr
945 @returns The same rmailaddr object so as to facilitate stacking
946 *///=========================================================================
947 rmailaddr& set(
948 /// RFC-compliant eMail address
949 const std::string& mailbox) {
950 set((char8_t*)mailbox.data(), mailbox.size());
951 return *this;
952 } // -x- rmailaddr& set -x-
953
954 /*======================================================================*//**
955 @copydoc set(const char*, int)
956 @see rmailaddr
957 @returns The same rmailaddr object so as to facilitate stacking
958 *///=========================================================================
959 rmailaddr& set(
960 /// RFC-compliant eMail address
961 const std::u8string& mailbox) {
962 set(mailbox.data(), mailbox.size());
963 return *this;
964 } // -x- rmailaddr& set -x-
965
966 /*======================================================================*//**
967 @copydoc set(const char*, int)
968 @see rmailaddr
969 @returns The same rmailaddr object so as to facilitate stacking
970 *///=========================================================================
971 rmailaddr& set(
972 /// RFC-compliant eMail address
973 const char8_t* mailbox,
974 /// Number of characters (-1 = ASCIIZ string)
975 int len = -1) {
976
977 // --------------------------------------------------------------------------
978 // Measure size of format string if an ASCIIZ string was indicated.
979 // --------------------------------------------------------------------------
980 if (len == -1) len = std::strlen((char*)mailbox);
981
982 // --------------------------------------------------------------------------
983 // Save a copy of the original eMail address.
984 // --------------------------------------------------------------------------
985 _addr.assign(mailbox, len); // We need to save this for later reference
986
987 // --------------------------------------------------------------------------
988 // Pre-adjustments (optional, as per policy flags).
989 // --------------------------------------------------------------------------
990 if (_policy_tabs_to_spaces) // Policy: Convert all tabs to spaces
991 _addr.replace(_addr.begin(), _addr.end(), '\t', ' '); // Efficient replacement
992
993 // --------------------------------------------------------------------------
994 // Internal variables.
995 // --------------------------------------------------------------------------
996 int offset = 0; // Offset within original mailbox char8_t[] array
997 int last_display_name = -1; // Used to build type "e" eMail tokens
998 int last_local_part = -1; // Used to build type "e" eMail tokens
999 int last_domain_part = -1; // Used to build type "e" eMail tokens
1000
1001 // --------------------------------------------------------------------------
1002 // Internal variables that are reset or updated together at various times,
1003 // such as when a token is [in most cases] completed.
1004 // --------------------------------------------------------------------------
1005 int token_begin = 0; // Beginning offset within current portion of string being parsed
1006 char8_t ch; // Character being tested (this needs to be defined outside of the main loop)
1007 bool flag_utf8 = false; // UTF8 character(s) detected
1008 bool flag_angle = false; // Angle-bracket detected
1009 bool flag_quote = false; // Quotation-marks mode detected
1010 bool active_angle = false; // Angle-bracket mode is active
1011 bool active_at_sign = false; // At-sign mode is active (domain-part instead of local-part interpretation)
1012 bool active_quote = false; // Quotation-marks mode is active
1013 int comment_depth = 0; // Comments are active when this value is greater than 0 (too many closed comments are in the negative)
1014 std::u8string p_token; // Processed token data (angle brackets, quotation marks, comments, and whitespace omitted)
1015 std::u8string p_token_sp; // Processed token data (angle brackets, quotation marks, and comments omitted), with spaces preserved
1016
1017 // --------------------------------------------------------------------------
1018 // Main parsing loop that identifies tokens and ensures compliance, and also
1019 // effectively pre-processes eMail addresses on-the-fly for faster access
1020 // from the _emails vector later.
1021 // --------------------------------------------------------------------------
1022 do {
1023
1024 // --------------------------------------------------------------------------
1025 // Obtain next character.
1026 // --------------------------------------------------------------------------
1027 ch = mailbox[offset];
1028
1029 // --------------------------------------------------------------------------
1030 // Compare one character at a time, but first process special cases of quoted
1031 // data (copy most of the data) and comments (ignore the data).
1032 // --------------------------------------------------------------------------
1033 if (flag_quote && active_quote && ch != '"') {
1034 if (QTEXT(ch)) { // Include only quoted text
1035 p_token.push_back(ch);
1036 p_token_sp.push_back(ch);
1037 } // -x- if QTEXT -x-
1038 continue;
1039 } else if (comment_depth > 0 && ch != ')') { // Ignore all comment data
1040 if (_policy_keep_comments) {
1041 p_token.push_back(ch);
1042 p_token_sp.push_back(ch);
1043 } // -x- if _policy_keep_comments -x-
1044 continue;
1045 } else
1046 main_parsing_switch: switch (ch) {
1047
1048 // --------------------------------------------------------------------------
1049 // Group name ends with a colon.
1050 // --------------------------------------------------------------------------
1051 case '"': {
1052 if (!active_quote) { // Enable quotation-marks mode
1053 if (flag_quote) _exception("quotation-marks mode can't be re-opened", offset);
1054 active_quote = true;
1055 flag_quote = true;
1056 } else { // Disable quotation-marks mode
1057 active_quote = false;
1058 }
1059 continue;
1060 } // -x- case " -x-
1061
1062 // --------------------------------------------------------------------------
1063 // Group name ends with a colon.
1064 // --------------------------------------------------------------------------
1065 case ':': {
1066
1067 // --------------------------------------------------------------------------
1068 // Internal tracking.
1069 // --------------------------------------------------------------------------
1070 group_depth++;
1071
1072 // --------------------------------------------------------------------------
1073 // Add this token to the tokens vector.
1074 // --------------------------------------------------------------------------
1075 _tokens.push_back({ .type = 'g',
1076 .offset = token_begin,
1077 .len = offset - token_begin,
1078 .flag_utf8 = flag_utf8,
1079 .p_token = p_token_sp, });
1080
1081 // --------------------------------------------------------------------------
1082 // Reset and prepare internal variables for the next token.
1083 // --------------------------------------------------------------------------
1084 RESET_FOR_NEXT_TOKEN;
1085 continue;
1086
1087 } // -x- case : -x-
1088
1089 // --------------------------------------------------------------------------
1090 // Group of eMail addresses is terminated by a semi-colon.
1091 // --------------------------------------------------------------------------
1092 case ';': {
1093
1094 // --------------------------------------------------------------------------
1095 // Internal tracking.
1096 // --------------------------------------------------------------------------
1097 if (--group_depth < 0) _exception("too many group construct terminators", offset);
1098 if (active_angle) _exception("unbalanced open angle bracket", offset);
1099
1100 // --------------------------------------------------------------------------
1101 // Add this token terminator to the tokens vector.
1102 // --------------------------------------------------------------------------
1103 _tokens.push_back({ .type = ';',
1104 .offset = token_begin,
1105 .len = offset - token_begin,
1106 .flag_utf8 = flag_utf8,
1107 .p_token = p_token_sp, });
1108
1109 // --------------------------------------------------------------------------
1110 // Reset and prepare internal variables for the next token.
1111 // --------------------------------------------------------------------------
1112 RESET_FOR_NEXT_TOKEN;
1113 continue;
1114
1115 } // -x- case ; -x-
1116
1117 // --------------------------------------------------------------------------
1118 // Opening angle bracket.
1119 // --------------------------------------------------------------------------
1120 case '<': {
1121
1122 // --------------------------------------------------------------------------
1123 // Internal tracking.
1124 // --------------------------------------------------------------------------
1125 if (flag_angle) _exception("unbalanced open angle bracket", offset);
1126 active_angle = true;
1127 flag_angle = true;
1128
1129 // --------------------------------------------------------------------------
1130 // Add this token terminator to the tokens vector if a display-name exists.
1131 // --------------------------------------------------------------------------
1132 if (token_begin < offset) {
1133 last_display_name = _tokens.size();
1134 _tokens.push_back({ .type = 'n',
1135 .offset = token_begin,
1136 .len = offset - token_begin,
1137 .flag_utf8 = flag_utf8,
1138 .p_token = p_token_sp, });
1139 } // -x- if token_begin -x-
1140
1141 // --------------------------------------------------------------------------
1142 // Reset and prepare internal variables for the next token.
1143 // --------------------------------------------------------------------------
1144 RESET_FOR_NEXT_TOKEN;
1145 continue;
1146
1147 } // -x- case < -x-
1148
1149 // --------------------------------------------------------------------------
1150 // At sign ("@") delimiter.
1151 // --------------------------------------------------------------------------
1152 case '@': {
1153
1154 // --------------------------------------------------------------------------
1155 // Internal tracking.
1156 // --------------------------------------------------------------------------
1157 if (active_at_sign) _exception("too many at (\"@\") signs", offset);
1158 active_at_sign = true;
1159
1160 // --------------------------------------------------------------------------
1161 // Add this token terminator to the tokens vector if a display-name exists.
1162 // --------------------------------------------------------------------------
1163 last_local_part = _tokens.size();
1164 _tokens.push_back({ .type = 'l',
1165 .offset = token_begin,
1166 .len = offset - token_begin,
1167 .flag_utf8 = flag_utf8,
1168 .flag_angle = flag_angle,
1169 .p_token = p_token, });
1170
1171 // --------------------------------------------------------------------------
1172 // Reset and prepare internal variables for the next token.
1173 // --------------------------------------------------------------------------
1174 RESET_FOR_NEXT_TOKEN;
1175 continue;
1176
1177 } // -x- case @ -x-
1178
1179 // --------------------------------------------------------------------------
1180 // Closing angle-bracket.
1181 // --------------------------------------------------------------------------
1182 case '>': {
1183
1184 // --------------------------------------------------------------------------
1185 // Internal tracking.
1186 // --------------------------------------------------------------------------
1187 if (!active_angle) _exception("unbalanced closing angle bracket", offset);
1188 active_angle = false;
1189 goto main_parsing_email;
1190
1191 // --------------------------------------------------------------------------
1192 // Reset and prepare internal variables for the next token.
1193 // --------------------------------------------------------------------------
1194 RESET_FOR_NEXT_TOKEN;
1195 continue;
1196
1197 } // -x- case > -x-
1198
1199 // --------------------------------------------------------------------------
1200 // Comma delimiter, signifies the end of an eMail address.
1201 // --------------------------------------------------------------------------
1202 case ',': {
1203
1204 main_parsing_comma:
1205 // --------------------------------------------------------------------------
1206 // Internal tracking.
1207 // --------------------------------------------------------------------------
1208 if (active_quote) _exception("unbalanced quotation-marks", offset);
1209 if (active_angle) _exception("unbalanced open angle bracket before comma", offset);
1210
1211 main_parsing_email:
1212 // --------------------------------------------------------------------------
1213 // Add this token terminator to the tokens vector if a display-name exists.
1214 // --------------------------------------------------------------------------
1215 if (active_at_sign) { // Domain-part has been started
1216 last_domain_part = _tokens.size();
1217 _tokens.push_back({ .type = 'd',
1218 .offset = token_begin,
1219 .len = offset - token_begin,
1220 .flag_utf8 = flag_utf8,
1221 .flag_angle = _tokens[last_local_part].flag_angle,
1222 .p_token = p_token, });
1223 active_at_sign = false;
1224 } else { // Domain-part has not been started, so there's only a local-part here
1225 last_local_part = _tokens.size();
1226 _tokens.push_back({ .type = 'l',
1227 .offset = token_begin,
1228 .len = offset - token_begin,
1229 .flag_utf8 = flag_utf8,
1230 .flag_angle = flag_angle,
1231 .p_token = p_token, });
1232 } // -x- if active_at_sign -x-
1233
1234 // --------------------------------------------------------------------------
1235 // Perform a few checks to make sure we're not creating phantom addresses.
1236 // --------------------------------------------------------------------------
1237 int __email_len = last_domain_part == -1 ? _tokens[last_local_part].len : (_tokens[last_domain_part].offset - _tokens[last_local_part].offset) + _tokens[last_domain_part].len;
1238//std::cout << "__email_len=" << std::to_string(__email_len) << std::endl;
1239 if (__email_len == 0 && !flag_angle) continue;
1240//std::cout << "last_local_part=" << std::to_string(last_local_part) << std::endl;
1241//std::cout << "last_domain_part=" << std::to_string(last_domain_part) << std::endl;
1242
1243 // --------------------------------------------------------------------------
1244 // Create a token of type "e" now that this eMail address is closed.
1245 //
1246 // The reason we're calculating size based on offsets instead of by adding
1247 // sizes together (and adding 1 for the "@" sign) is that commants can be
1248 // included in the localpart portion, which normally won't be counted in any
1249 // localpart sizes.
1250 // --------------------------------------------------------------------------
1251 _index_e.push_back(_tokens.size()); // Add to index of eMail addresses (before adding to _tokens vector, _tokens.size() is the position)
1252 _tokens.push_back({ .type = 'e',
1253 .offset = _tokens[last_local_part].offset,
1254 .len = __email_len,// - token_begin,
1255 .flag_utf8 = _tokens[last_local_part].flag_utf8 || flag_utf8,
1256 .flag_angle = _tokens[last_local_part].flag_angle,
1257 .flag_null_addr = __email_len == 0,
1258 .p_token = _tokens[last_local_part].p_token + ((last_domain_part == -1 || _tokens[last_domain_part].p_token.empty()) ? u8"" : u8"@" + _tokens[last_domain_part].p_token),
1259 .index_display_name = last_display_name,
1260 .index_local_part = last_local_part,
1261 .index_domain_part = last_domain_part, });
1262 last_display_name = -1;
1263 last_local_part = -1;
1264 last_domain_part = -1;
1265 flag_angle = false;
1266
1267 // --------------------------------------------------------------------------
1268 // Reset and prepare internal variables for the next token.
1269 // --------------------------------------------------------------------------
1270 RESET_FOR_NEXT_TOKEN;
1271 continue;
1272
1273 } // -x- case , -x-
1274
1275 // --------------------------------------------------------------------------
1276 // Opening comment parenthesis.
1277 // --------------------------------------------------------------------------
1278 case '(': {
1279 comment_depth++;
1280 continue;
1281 } // -x- case ( -x-
1282
1283 // --------------------------------------------------------------------------
1284 // Closing comment parenthesis.
1285 // --------------------------------------------------------------------------
1286 case ')': {
1287 if (--comment_depth < 0) _exception("unbalanced closing comment parenthesis", offset);
1288 continue;
1289 } // -x- case ) -x-
1290
1291 // --------------------------------------------------------------------------
1292 // Backslash (quote-literal).
1293 // --------------------------------------------------------------------------
1294 case '\\': {
1295
1296 // --------------------------------------------------------------------------
1297 // Prevent a potential out-of-bounds buffer-overrun problem.
1298 // --------------------------------------------------------------------------
1299 if (++offset == len) {
1300 _exception("unbalanced quote-literal (backslash)", offset);
1301 continue; // Do this in case we're not throwing exceptions
1302 } // -x- if offset -x-
1303
1304 // --------------------------------------------------------------------------
1305 // Update to next character (whatever it is, we're taking it literally).
1306 // --------------------------------------------------------------------------
1307 ch = mailbox[offset];
1308 goto main_parsing_loop_default; // Fall-through to default
1309
1310 } // -x- case \ -x-
1311
1312 // --------------------------------------------------------------------------
1313 // All remaining characters.
1314 // --------------------------------------------------------------------------
1315 default:
1316 //if (flag_angle) _exception("additional data not permitted", offset);
1317 main_parsing_loop_default:
1318 if (ch > 127) { // Include all UTF-8 character (unless prevented by the exception)
1319 flag_utf8 = true;
1320 if (!_policy_support_utf8) _exception("UTF-8 byte encountered", offset);
1321 p_token.push_back(ch);
1322 p_token_sp.push_back(ch);
1323 } else if (CTEXT(ch) || ' ') { // Include almost everything for now (including spaces)
1324 if (ch != ' ') p_token.push_back(ch); // Exclude spaces
1325 if (!(ch == ' ' && p_token.size() == 0)) p_token_sp.push_back(ch); // Keep spaces
1326 } // -x- if ch -x-
1327
1328 } // -x- switch ch -x-
1329
1330 } while (++offset < len); // -x- do while -x-
1331
1332 // --------------------------------------------------------------------------
1333 // If the final token isn't empty (a.k.a., unfinished / not sealed), then
1334 // figure out what to do and run one more time, or else throw an exception.
1335 // --------------------------------------------------------------------------
1336 if (offset == len && token_begin < offset) {
1337 ch = ','; // Force comma (",") on parsing loop
1338 goto main_parsing_switch;
1339 } else if (offset > len && token_begin < offset) {
1340 _exception("incomplete data", offset - 1);
1341 } // -x- if offset -x-
1342 return *this;
1343
1344 } // -x- rmailaddr& set -x-
1345
1346 /*======================================================================*//**
1347 @brief
1348 Find out how many eMail addresses this object holds.
1349 @see empty
1350 @see has_any
1351 @see has_multiple
1352 @see has_one
1353 @returns The number of eMail addresses
1354 *///=========================================================================
1355 int size() {
1356 return _index_e.size();
1357 } // -x- int size -x-
1358
1359 /*======================================================================*//**
1360 @brief
1361 Generate a detailed output of all tokens that's useful for debugging.
1362
1363 @code
1364 Types:
1365 g = group name (beginning; includes colon)
1366 ; = group termination (semi-colon character)
1367 n = display name
1368 e = eMail address (includes angle brackets, if present)
1369 l = local-part
1370 d = domain-part
1371 c = comment (not implemented)
1372 \0 = not initialized (null; regard as "unknown"; this should never happen)
1373 @endcode
1374
1375 The difference between "token" and "p_token" is that "token" is the original
1376 and [mostly] unprocessed atom, while "p_token" has been processed with any
1377 sets of angle-brackets, sets of quotation-marks, comments, and whitespace
1378 removed. In nearly all instances, the value of "p_token" is what's needed.
1379 @returns std::string containing multi-line text (one token per line)
1380 *///=========================================================================
1381 std::string tokens_to_string(
1382 /// Filter (string containing characters for those types that are to be
1383 /// included {unrecognized types will be ignored}; the default is no filter)
1384 const std::string& filter = "",
1385 /// Prefix (text to insert before the beginning of each line)
1386 const std::string& prefix = "",
1387 /// End-of-Line sequence (default is "\n")
1388 const std::string& eol = "\n") {
1389
1390 // --------------------------------------------------------------------------
1391 // Internal variables.
1392 // --------------------------------------------------------------------------
1393 std::string t;
1394
1395 // --------------------------------------------------------------------------
1396 // Loop that builds list of tokens (one per line).
1397 // --------------------------------------------------------------------------
1398 for (int i = 0; i < _tokens.size(); i++) {
1399
1400 // --------------------------------------------------------------------------
1401 // Check filter.
1402 // --------------------------------------------------------------------------
1403 if (filter.empty() || filter.find(_tokens[i].type) != std::string::npos) {
1404
1405 // --------------------------------------------------------------------------
1406 // Shared characteristics.
1407 // --------------------------------------------------------------------------
1408 t.append(prefix + "index=" + std::to_string(i)
1409 + " type=" + _tokens[i].type
1410 + " utf8=" + (_tokens[i].flag_utf8 ? "y" : "n")
1411 + " punycode=" + (_tokens[i].flag_punycode ? "y" : "n")
1412 + " obsolete=" + (_tokens[i].flag_obsolete ? "y" : "n")
1413 + " offset=" + std::to_string(_tokens[i].offset)
1414 + " length=" + std::to_string(_tokens[i].len)
1415 + " token=" + std::string((char*)_addr.c_str()).substr(_tokens[i].offset, _tokens[i].len)
1416 + " p_token=" + (char*)_tokens[i].p_token.c_str());
1417
1418 // --------------------------------------------------------------------------
1419 // Type-specific characteristics.
1420 // --------------------------------------------------------------------------
1421 switch (_tokens[i].type) {
1422 case 'd':
1423 t.append(std::string( " fqdn=") + (_tokens[i].flag_fqdn ? "y" : "n"));
1424 break;
1425 case 'e':
1426 t.append(std::string( " angle=") + (_tokens[i].flag_angle ? "y" : "n"));
1427 // Fall-through to type "l"
1428 case 'l':
1429 t.append(std::string(" null_addr=") + (_tokens[i].flag_null_addr ? "y" : "n"));
1430 break;
1431 } // -x- switch type -x-
1432
1433 // --------------------------------------------------------------------------
1434 // Final EoL (End of Line) sequence.
1435 // --------------------------------------------------------------------------
1436 t.append(eol);
1437
1438 } // -x- if filter -x-
1439
1440 } // -x- for i -x-
1441 return t;
1442
1443 } // -x- std::string tokens_to_string -x-
1444
1445 /*======================================================================*//**
1446 @brief
1447 Array-style access to eMail addresses. The first element is at index 0.
1448 @see get
1449 @see domain_part
1450 @see local_part
1451 @returns std::u8string with only the eMail address (no angle brackets, etc.)
1452 as a native UTF-8 string
1453 *///=========================================================================
1454 std::u8string operator[](
1455 /// Index of eMail address to query for (0 = first element; negative index
1456 /// values are calculated in reverse, starting with -1 as the final position)
1457 const int index) {
1458 return _tokens[_index_e[index >= 0 ? index : _index_e.size() + index]].p_token;
1459 } // -x- std::u8string operator[] -x-
1460
1461 /*======================================================================*//**
1462 @brief
1463 Support convenient streaming usage with std::cout, std::cerr, and friends.
1464 @returns eMail address in human-readable form
1465 *///=========================================================================
1466 friend std::ostream& operator<< (
1467 /// Output stream (provided automatically by std::cout and std::cerr)
1468 std::ostream& o,
1469 /// Object class (matched by compiler)
1470 rmailaddr const& c) {
1471 return o << (char*)c._addr.c_str();
1472 } // -x- std::ostream& operator<< -x-
1473
1474 }; // -x- class rmailaddr -x-
1475
1476}; // -x- namespace randolf -x-