3// #include <randolf/rhostname_flags>
5#include <idn2.h> // apt install libidn2-dev
9 /*======================================================================*//**
11 Internal structure that @ref rhostname uses to store @ref rlabel portions, of
12 which at least one comprises a hostname.
15 See the `UTF-8 Everywhere` web site for simple, straight-forward, and helpful
16 insights and advice on how to properly work with UTF8 data in C++ (and other
17 languages) at: https://www.utf8everywhere.org/
19 @author Randolf Richardson
22 - 2023-Apr-27 v1.00 Initial version
23 - 2025-Feb-03 v1.00 Increased use of references and pointers
26 *///=========================================================================
30 /// The DNS RR flag indicates that this label is derived from a DNS RR
31 /// (wherein the first character indicates the length of the label in bytes)
33 /// The UTF-8 flag indicates that this label is an internationalized label
34 /// because it contains at least one UTF-8 sequence
36 /// The xn flag indicates that this label seems to be in punycode format,
37 /// because it begins with the sequence "xn--" and is at least 5 bytes long
39 /// Conversion return code (@c IDN2_OK = successful conversion to UTF8 or
40 /// punycode; or this is the default because no conversion was performed)
44 /*======================================================================*//**
46 Optional flags that alter, modify, or enhance the operation of hostname
48 *///=========================================================================
49 enum HOSTNAME_FLAGS: int { // Bit flags: the values visible here are distinct powers of two
51 /*----------------------------------------------------------------------*//**
52 The HOSTNAME_DEFAULT flag isn't necessary, but it's included here for
53 completeness as it accommodates programming styles that prefer to emphasize
54 when defaults are being relied upon.
55 *///-------------------------------------------------------------------------
58 /*----------------------------------------------------------------------*//**
59 Alternate between an IP address and its IN-ADDR.ARPA formatted counter-part.
61 This causes an IP address's octets (IPv4 format) or segments (IPv6 format),
62 when supplied where a hostname would normally be expected, to be effectively
65 Or, deconvert for output an effectively-reversed IP-address-as-a-hostname in
66 the appropriate IPv4/IPv6 format. (All remaining labels will be removed.)
69 This format is commonly used to query DNS-based blocklists/blacklists,
70 greylists, whitelists, etc.
71 *///-------------------------------------------------------------------------
74 /*----------------------------------------------------------------------*//**
75 Convert path to hostname. This converts a path, delimited by slashes (as is
76 standard in UNIX, Linux, etc., as well as with internet URIs), to a hostname
77 with the top-level portion corresponding to the top-level label as used in
78 the Domain Name System, the second-level portion with the second-level label,
81 For example: The path "/internet/com/example/www/" gets converted into the
82 DNS hostname "www.example.com.internet" (to remove the top/last
83 label, use the @c remove(-1) method after conversion, or use
84 the @c path_to_hostname method which provides an additional
85 parameter to specify how many elements of the path to skip from
86 both the beginning and the end).
87 *///-------------------------------------------------------------------------
88 HOSTNAME_FROM_PATH = 2,
90 /*----------------------------------------------------------------------*//**
91 Don't throw exceptions when parsing a hostname string when the format is
92 invalid (or the data is corrupt).
94 When an index parameter is out of range, automatically truncate it to the
95 maximum bounds of any underlying vector instead of throwing an exception when
96 an index is out of range.
98 This is useful for evaluating a hostname from a data source that is intended
99 to be reported on in a diagnostic fashion, or as part of a debugging effort.
100 *///-------------------------------------------------------------------------
101 HOSTNAME_WITHOUT_EXCEPTIONS = 4,
103 /*----------------------------------------------------------------------*//**
104 If hostname is an FQDN, then include the final period at the end (used by the
105 @ref rhostname method, primarily).
107 (The "OPT" portion of this flag name means "optional.")
108 *///-------------------------------------------------------------------------
109 HOSTNAME_FQDN_OPT = 8,
111 /*----------------------------------------------------------------------*//**
112 Convert label from/to DNS RR format (this is mostly only useful for authors
113 of DNS client or server software, or raw DNS packet analysis code).
115 This format is used in DNS packets, and so this flag could be useful in
116 software projects like DNS resolvers, DNS daemons, and DNS packet analyzers.
117 *///-------------------------------------------------------------------------
118 HOSTNAME_DNS_RR = 16,
120 /*----------------------------------------------------------------------*//**
121 Convert label from/to normal UTF-8 data.
124 The HOSTNAME_UTF8 and HOSTNAME_XN flags represent two exclusively different
125 formats, and combining them for a hostname (or for an individual label) is
126 likely to yield unpredictable results (the order in which these two flags are
127 tested will vary depending on which blocks of code are doing the processing,
128 and the programming logic being used; and future updates to the code may also
129 change the order of flag tests and the programming logic).
131 This format is normally presented to users to present hostnames natively in
134 *///-------------------------------------------------------------------------
137 /*----------------------------------------------------------------------*//**
138 Convert label from/to punycode format wherein the first four characters will
139 begin with the ASCII sequence "xn--" if any UTF-8 sequences are present.
142 The HOSTNAME_XN and HOSTNAME_UTF8 flags represent two exclusively different
143 formats, and combining them for a hostname (or for an individual label) is
144 likely to yield unpredictable results (the order in which these two flags are
145 tested will vary depending on which blocks of code are doing the processing,
146 and the programming logic being used; and future updates to the code may also
147 change the order of flag tests and the programming logic).
149 This format is used by client software (e.g., eMail applications and web
150 browsers) when attempting to resolve the IP addresses of remote hosts or
151 lookup other DNS records.
153 *///-------------------------------------------------------------------------
156 }; // -x- enum HOSTNAME_FLAGS -x-
158 /*======================================================================*//**
160 Structure constructor that detects whether any UTF-8 sequence is present in
161 the data and/or if it's already in punycode format (e.g., it begins with the
164 Although the default behaviour is to not convert the data between UTF-8 and
165 punycode, either the @ref HOSTNAME_UTF8 or @ref HOSTNAME_XN flag may be
166 specified to cause such a conversion for underlying storage. The benefit of
167 using these flags here is mostly for performance optimization where repeated
168 outbound conversions can be prevented when accessing the data multiple times.
169 @throws std::invalid_argument If the label is blank or there is a problem
170 with the format (e.g., invalid characters)
171 *///=========================================================================
174 const std::string& data,
175 /// @ref HOSTNAME_DNS_RR Convert label from DNS RR format@n
176 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
177 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
178 const int flags = HOSTNAME_DEFAULT) {
180 // --------------------------------------------------------------------------
182 // --------------------------------------------------------------------------
183 if (data.empty()) throw std::invalid_argument("rlabel is empty");
185 // --------------------------------------------------------------------------
186 // Convert label from DNS RR format where the first byte specifies the length
187 // of the remaining data that follows (the std::min function is used to
188 // prevent a buffer overrun by automatically truncating the data; this will
189 // already have been vetted by rhostname::hostname, so we don't need to do
190 // this again here at this time).
191 // --------------------------------------------------------------------------
192 this->data = (flags & HOSTNAME_DNS_RR) ? data.substr(1, std::min((size_t)((u_char)data.front()), data.size()))
195 // --------------------------------------------------------------------------
196 // Perform conversions, depending on the flags. Our switch statement's scope
197 // is limited only to specific flags, which are bitwise OR'd together so as
198 // to handle multi-purposed bitfield options in an elegant fashion since each
199 // of these particular flags is exclusive.
 //
 // In both cases below, a failed conversion (idn2rc != IDN2_OK) leaves
 // this->data unchanged and the libidn2 return code remains in idn2rc.
200 // --------------------------------------------------------------------------
201 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
202 case HOSTNAME_UTF8: // Convert to UTF8
203 if (this->data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
205 idn2rc = idn2_to_unicode_8z8z(this->data.data(), &p, 0); // libidn2 stores its output buffer in p
206 if (idn2rc == IDN2_OK) this->data = std::string((char*)p); // Copy the converted text into our own storage
207 if (p != nullptr) idn2_free(p); // Release the buffer that libidn2 provided
208 } // -x- if ^xn-- -x-
210 case HOSTNAME_XN: // Convert to punycode
211 if (!this->data.starts_with("xn--")) { // Label is not already in punycode format
213 idn2rc = idn2_to_ascii_8z(this->data.data(), &p, 0); // libidn2 stores its output buffer in p
214 if (idn2rc == IDN2_OK) this->data = std::string((char*)p); // Copy the converted text into our own storage
215 if (p != nullptr) idn2_free(p); // Release the buffer that libidn2 provided
216 } // -x- if !xn-- -x-
218 } // -x- switch flags -x-
220 // --------------------------------------------------------------------------
221 // Record whether this label is already in punycode format.
223 // Note: A 4-byte label of "xn--" is not valid punycode, so we intentionally
224 // check for a "greater than 4" string size (this is not erroneous).
225 // --------------------------------------------------------------------------
226 if (this->data.size() > 4 && this->data.starts_with("xn--")) xn = true;
228 // --------------------------------------------------------------------------
229 // Loop through string to determine UTF8 (any character is greater than 127).
 //
 // NOTE(review): the loop runs from the last byte down to index 1 (condition
 // is "i > 0"), so index 0 is never examined. A multi-byte UTF-8 sequence
 // that starts at index 0 is still caught through its continuation byte at
 // index 1, but a lone byte >= 128 at index 0 (e.g., in a 1-byte label) would
 // not be detected -- confirm whether skipping index 0 is intentional.
230 // --------------------------------------------------------------------------
231 for (int i = this->data.size() - 1; i > 0; i--) {
232 if ((u_char)this->data[i] >= 128) { // UTF8 byte detected
238//std::cout << "Label: " << this->data << std::endl; // Debug
239//std::cout << " XN: " << xn << std::endl; // Debug
240//std::cout << " UTF8: " << utf8 << std::endl; // Debug
242 } // -x- constructor rlabel -x-
245// std::cout << "Destructor: rlabel = " << data << std::endl; // Debug
248 /*======================================================================*//**
250 Provide label as an std::string with any needed conversions.
251 @returns Label as an std::string object
252 *///=========================================================================
254 /// @ref HOSTNAME_DNS_RR Convert label to DNS RR format@n
255 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
256 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
257 const int flags = HOSTNAME_DEFAULT) {
259 // --------------------------------------------------------------------------
260 // Internal variables.
261 // --------------------------------------------------------------------------
264 // --------------------------------------------------------------------------
265 // Perform conversions, depending on the flags. Our switch statement's scope
266 // is limited only to specific flags, which are bitwise OR'd together so as
267 // to handle multi-purposed bitfield options in an elegant fashion since each
268 // of these particular flags is exclusive.
269 // --------------------------------------------------------------------------
270 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
271 case HOSTNAME_UTF8: // Convert to UTF8
272 if (data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
274 idn2rc = idn2_to_unicode_8z8z(data.data(), &p, 0); // libidn2 stores its output buffer in p
275 if (idn2rc == IDN2_OK) _label.assign((char*)p); // Copy the converted text into the result
276 if (p != nullptr) idn2_free(p); // Release the buffer that libidn2 provided
279 } // -x- if ^xn-- -x-
281 case HOSTNAME_XN: // Convert to punycode
282 if (!data.starts_with("xn--")) { // Label is not already in punycode format
284 idn2rc = idn2_to_ascii_8z(data.data(), &p, 0); // libidn2 stores its output buffer in p
285 if (idn2rc == IDN2_OK) _label.assign((char*)p); // Copy the converted text into the result
286 if (p != nullptr) idn2_free(p); // Release the buffer that libidn2 provided
289 } // -x- if !xn-- -x-
291 default: // No conversions necessary, so just assign the label as is
293 } // -x- switch flags -x-
295 // --------------------------------------------------------------------------
296 // Insert label's length (maximum 255 characters) if it is to be converted to
298 // --------------------------------------------------------------------------
 // NOTE(review): only the inserted length byte is clamped to 255 here; no code
 // visible in this method truncates _label itself, so a longer label would be
 // emitted with a length byte that doesn't match its size. RFC 1035 also
 // limits a single label to 63 octets -- confirm that callers enforce this.
299 if (flags & HOSTNAME_DNS_RR)
300 _label.insert(0, // Position: before first character
301 1, // Insert only one character
302 (u_char)(std::min(_label.size(), (size_t)255))); // Convert size to u_char, limit to 255
304 // --------------------------------------------------------------------------
306 // --------------------------------------------------------------------------
309 } // -x- std::string get -x-
311 /*======================================================================*//**
313 Built-in comparison operator used by @c std::set for ordering rlabel
314 objects by the underlying label data.
315 @returns Result of comparing this label's data with the data of @c rhl using
 the @c < operator (i.e., whether this label orders before @c rhl)
316 *///=========================================================================
318 /// The other rlabel structure to compare this one against
319 const rlabel& rhl) const {
320 return data < rhl.data;
323 }; // -x- struct rlabel -x-
325}; // -x- namespace randolf -x-