3// #include <randolf/rhostname_flags>
5#include <idn2.h> // apt install libidn2-dev
9 /*======================================================================*//**
11 Internal structure that @ref rhostname uses to store @ref rlabel portions, of
12 which at least one comprises a hostname.
15 See the `UTF-8 Everywhere` web site for simple, straight-forward, and helpful
16 insights and advice on how to properly work with UTF8 data in C++ (and other
17 languages) at: https://www.utf8everywhere.org/
19 *///=========================================================================
23 /// The DNS RR flag indicates that this label is derived from a DNS RR
24 /// (wherein the first character indicates the length of the label in bytes)
26 /// The UTF-8 flag indicates that this label is an internationalized label
27 /// because it contains at least one UTF-8 sequence
29 /// The xn flag indicates that this label seems to be in punycode format,
30 /// because it begins with the sequence "xn--" and is at least 5 bytes long
32 /// Conversion return code (@c IDN2_OK = successful conversion to UTF8 or
33 /// punycode; or this is the default because no conversion was performed)
37 /*======================================================================*//**
39 Optional flags that alter, modify, or enhance the operation of hostname
41 *///=========================================================================
42 enum HOSTNAME_FLAGS: int {
44 /*----------------------------------------------------------------------*//**
45 The HOSTNAME_DEFAULT flag isn't necessary, but it's included here for
46 completeness as it accommodates programming styles that prefer to emphasize
47 when defaults are being relied upon.
48 *///-------------------------------------------------------------------------
51 /*----------------------------------------------------------------------*//**
52 Alternate between an IP address and its IN-ADDR.ARPA formatted counter-part.
54 This causes an IP address's octets (IPv4 format) or segments (IPv6 format),
55 when supplied where a hostname would normally be expected, to be effectively
58 Or, deconvert for output an effectively-reversed IP-address-as-a-hostname in
59 the appropriate IPv4/IPv6 format. (All remaining labels will be removed.)
62 This format is commonly used to query DNS-based blocklists/blacklists,
63 greylists, whitelists, etc.
64 *///-------------------------------------------------------------------------
67 /*----------------------------------------------------------------------*//**
68 Convert path to hostname. This converts a path, delimited by slashes (as is
69 standard in UNIX, Linux, etc., as well as with internet URIs), to a hostname
70 with the top-level portion corresponding to the top-level label as used in
71 the Domain Name System, the second-level portion with the second-level label,
74 For example: The path "/internet/com/example/www/" gets converted into the
75 DNS hostname "www.example.com.internet" (to remove the top/last
76 label, use the @c remove(-1) method after conversion, or use
77 the @c path_to_hostname method which provides an additional
78 parameter to specify how many elements of the path to skip from
79 both the beginning and the end).
80 *///-------------------------------------------------------------------------
81 HOSTNAME_FROM_PATH = 2,
83 /*----------------------------------------------------------------------*//**
84 Don't throw exceptions when parsing a hostname string when the format is
85 invalid (or the data is corrupt).
87 When an index parameter is out of range, automatically truncate it to the
88 maximum bounds of any underlying vector instead of throwing an exception when
89 an index is out of range.
91 This is useful for evaluating a hostname from a data source that is intended
92 to be reported on in a diagnostic fashion, or as part of a debugging effort.
93 *///-------------------------------------------------------------------------
94 HOSTNAME_WITHOUT_EXCEPTIONS = 4,
96 /*----------------------------------------------------------------------*//**
97 If hostname is an FQDN, then include the final period at the end (used by the
98 @ref rhostname method, primarily).
100 (The "OPT" portion of this flag name means "optional.")
101 *///-------------------------------------------------------------------------
102 HOSTNAME_FQDN_OPT = 8,
104 /*----------------------------------------------------------------------*//**
105 Convert label from/to DNS RR format (this is mostly only useful for authors
106 of DNS client or server software, or raw DNS packet analysis code).
108 This format is used in DNS packets, and so this flag could be useful in
109 software projects like DNS resolvers, DNS daemons, and DNS packet analyzers.
110 *///-------------------------------------------------------------------------
111 HOSTNAME_DNS_RR = 16,
113 /*----------------------------------------------------------------------*//**
114 Convert label from/to normal UTF-8 data.
117 The HOSTNAME_UTF8 and HOSTNAME_XN flags represent two exclusively different
118 formats, and combining them for a hostname (or for an individual label) is
119 likely to yield unpredictable results (the order in which these two flags are
120 tested will vary depending on which blocks of code are doing the processing,
121 and the programming logic being used; and future updates to the code may also
122 affect changes to the order of flag tests and the programming logic).
124 This format is normally presented to users to present hostnames natively in
127 *///-------------------------------------------------------------------------
130 /*----------------------------------------------------------------------*//**
131 Convert label from/to punycode format wherein the resulting label will begin
132 with the ASCII sequence "xn--" if any UTF-8 sequences are present.
135 The HOSTNAME_XN and HOSTNAME_UTF8 flags represent two exclusively different
136 formats, and combining them for a hostname (or for an individual label) is
137 likely to yield unpredictable results (the order in which these two flags are
138 tested will vary depending on which blocks of code are doing the processing,
139 and the programming logic being used; and future updates to the code may also
140 affect changes to the order of flag tests and the programming logic).
142 This format is used by client software (e.g., eMail applications and web
143 browsers) when attempting to resolve the IP addresses of remote hosts or
144 lookup other DNS records.
146 *///-------------------------------------------------------------------------
149 }; // -x- enum HOSTNAME_FLAGS -x-
151 /*======================================================================*//**
153 Structure constructor that detects whether any UTF-8 sequence is present in
154 the data and/or if it's already in punycode format (e.g., it begins with the
157 Although the default behaviour is to not convert the data between UTF-8 and
158 punycode, either the @ref HOSTNAME_UTF8 or @ref HOSTNAME_XN flag may be
159 specified to cause such a conversion for underlying storage. The benefit of
160 using these flags here is mostly for performance optimization where repeated
161 outbound conversions can be prevented when accessing the data multiple times.
162 @throws std::invalid_argument If the label is blank or there is a problem
163 with the format (e.g., invalid characters)
164 *///=========================================================================
167 const std::string data,
168 /// @ref HOSTNAME_DNS_RR Convert label from DNS RR format@n
169 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
170 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
171 const int flags = HOSTNAME_DEFAULT) {
173 // --------------------------------------------------------------------------
175 // --------------------------------------------------------------------------
176 if (data.empty()) throw std::invalid_argument("rlabel is empty");
178 // --------------------------------------------------------------------------
179 // Convert label from DNS RR format where the first byte specifies the length
180 // of the remaining data that follows (the std::min function is used to
181 // prevent a buffer overrun by automatically truncating the data; this will
182 // already have been vetted by rhostname::hostname, so we don't need to do
183 // this again here at this time).
184 // --------------------------------------------------------------------------
185 this->data = (flags & HOSTNAME_DNS_RR) ? data.substr(1, std::min((size_t)((u_char)data.front()), data.size()))
188 // --------------------------------------------------------------------------
189 // Perform conversions, depending on the flags. Our switch statement's scope
190 // is limited only to specific flags, which are bitwise OR'd together so as
191 // to handle multi-purposed bitfield options in an elegant fashion since each
192 // of these particular flags is exclusive.
193 // --------------------------------------------------------------------------
194 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
195 case HOSTNAME_UTF8: // Convert to UTF8
196 if (this->data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
198 idn2rc = idn2_to_unicode_8z8z(this->data.data(), &p, 0);
199 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
200 if (p != nullptr) idn2_free(p);
201 } // -x- if ^xn-- -x-
203 case HOSTNAME_XN: // Convert to punycode
204 if (!this->data.starts_with("xn--")) { // Label is not already in punycode format
206 idn2rc = idn2_to_ascii_8z(this->data.data(), &p, 0);
207 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
208 if (p != nullptr) idn2_free(p);
209 } // -x- if !xn-- -x-
211 } // -x- switch flags -x-
213 // --------------------------------------------------------------------------
214 // Record whether this label is already in punycode format.
216 // Note: A 4-byte label of "xn--" is not valid punycode, so we intentionally
217 // check for a "greater than 4" string size (this is not erroneous).
218 // --------------------------------------------------------------------------
219 if (this->data.size() > 4 && this->data.starts_with("xn--")) xn = true;
221 // --------------------------------------------------------------------------
222 // Loop through string to determine UTF8 (any character is greater than 127).
223 // --------------------------------------------------------------------------
224 for (int i = this->data.size() - 1; i > 0; i--) {
225 if ((u_char)this->data[i] >= 128) { // UTF8 byte detected
231//std::cout << "Label: " << this->data << std::endl; // Debug
232//std::cout << " XN: " << xn << std::endl; // Debug
233//std::cout << " UTF8: " << utf8 << std::endl; // Debug
235 }; // -x- constructor rlabel -x-
238// std::cout << "Destructor: rlabel = " << data << std::endl; // Debug
241 /*======================================================================*//**
243 Provide label as an std::string with any needed conversions.
244 @returns Label as an std::string object
245 *///=========================================================================
247 /// @ref HOSTNAME_DNS_RR Convert label to DNS RR format@n
248 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
249 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
250 const int flags = HOSTNAME_DEFAULT) {
252 // --------------------------------------------------------------------------
253 // Internal variables.
254 // --------------------------------------------------------------------------
257 // --------------------------------------------------------------------------
258 // Perform conversions, depending on the flags. Our switch statement's scope
259 // is limited only to specific flags, which are bitwise OR'd together so as
260 // to handle multi-purposed bitfield options in an elegant fashion since each
261 // of these particular flags is exclusive.
262 // --------------------------------------------------------------------------
263 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
264 case HOSTNAME_UTF8: // Convert to UTF8
265 if (data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
267 idn2rc = idn2_to_unicode_8z8z(data.data(), &p, 0);
268 if (idn2rc == IDN2_OK) _label = std::string((char*)p);
269 if (p != nullptr) idn2_free(p);
272 } // -x- if ^xn-- -x-
274 case HOSTNAME_XN: // Convert to punycode
275 if (!data.starts_with("xn--")) { // Label is not already in punycode format
277 idn2rc = idn2_to_ascii_8z(data.data(), &p, 0);
278 if (idn2rc == IDN2_OK) _label = std::string((char*)p);
279 if (p != nullptr) idn2_free(p);
282 } // -x- if !xn-- -x-
284 default: // No conversions necessary, so just assign the label as is
286 } // -x- switch flags -x-
288 // --------------------------------------------------------------------------
289 // Insert label's length (maximum 255 characters) if it is to be converted to
291 // --------------------------------------------------------------------------
292 if (flags & HOSTNAME_DNS_RR)
293 _label.insert(0, // Position: before first character
294 1, // Insert only one character
295 (u_char)(std::min(_label.size(), (size_t)255))); // Convert size to u_char, limit to 255
297 // --------------------------------------------------------------------------
299 // --------------------------------------------------------------------------
302 } // -x- std::string get -x-
304 /*======================================================================*//**
306 Built-in comparison operator used by @c std::set for ordering rlabel
307 objects by the underlying label data.
308 @returns @c true if this label's data sorts before the data of @c rhl
309 *///=========================================================================
311 /// This rlabel structure
312 const rlabel& rhl) const {
313 return data < rhl.data;
316 }; // -x- struct rlabel -x-
318}; // -x- namespace randolf -x-