randolf.ca  1.00
Randolf Richardson's C++ classes
Loading...
Searching...
No Matches
rlabel
1#pragma once
2
3// #include <randolf/rhostname_flags>
4
5#include <idn2.h> // apt install libidn2-dev
6
7namespace randolf {
8
9 /*======================================================================*//**
10 @brief
11 Internal structure that @ref rhostname uses to store @ref rlabel portions, of
12 which at least one comprises a hostname.
13
14 @note
15 See the `UTF-8 Everywhere` web site for simple, straight-forward, and helpful
16 insights and advice on how to properly work with UTF8 data in C++ (and other
17 languages) at: https://www.utf8everywhere.org/
18
19 @author Randolf Richardson
20 @version 1.00
21 @par History
22 - 2023-Apr-27 v1.00 Initial version
23 - 2025-Feb-03 v1.00 Increased use of references and pointers
24
25 @see rhostname
26 *///=========================================================================
27 struct rlabel {
28 /// Label data
29 std::string data;
30 /// The DNS RR flag indicates that this label is derived from a DNS RR
31 /// wherein the first character indicates the length of the label in bytes)
32 bool dns_rr = false;
33 /// The UTF-8 flag indicates that this label is an internationalized label
34 /// because it contains at least one UTF-8 sequence
35 bool utf8 = false;
36 /// The xn flag indicates that this label seems to be in punycode format,
37 /// because it begins with the sequence "xn--" and is at least 5 bytes long
38 bool xn = false;
39 /// Conversion return code (@c IDN2_OK = successful conversion to UTF8 or
40 /// punycode; or this is the default because no conversion was performed)
41 int idn2rc = IDN2_OK;
42
43 public:
44 /*======================================================================*//**
45 @brief
46 Optional flags that alter, modify, or enhance the operation of hostname
47 handling.
48 *///=========================================================================
49 enum HOSTNAME_FLAGS: int {
50
51 /*----------------------------------------------------------------------*//**
52 The HOSTNAME_DEFAULT flag isn't necessary, but it's included here for
53 completeness as it accomodates programming styles that prefer to emphasize
54 when defaults are being relied upon.
55 *///-------------------------------------------------------------------------
56 HOSTNAME_DEFAULT = 0,
57
58 /*----------------------------------------------------------------------*//**
59 Alternate between an IP address and its IN-ADDR.ARPA formatted counter-part.
60
61 This causes an IP address's octets (IPv4 format) or segments (IPv6 format),
62 when supplied where a hostname would normally be expected, to be effectively
63 reveresed.
64
65 Or, deconvert for output an effectively-reversed IP-address-as-a-hostname in
66 the appropriate IPv4/IPv6 format. (All remaining labels will be removed.)
67
68 @note
69 This format is commonly used to query DNS-based blocklists/blacklists,
70 greylists, whitelists, etc.
71 *///-------------------------------------------------------------------------
72 HOSTNAME_IP_ADDR = 1,
73
74 /*----------------------------------------------------------------------*//**
75 Convert path to hostname. This converts a path, delimited by slashes (as is
76 standard in UNIX, Linux, etc., as well as with internet URIs), to a hostname
77 with the top-level portion corresponding to the top-level label as used in
78 the Domain Name System, the second-level portion with the second-level label,
79 etc.
80
81 For example: The path "/internet/com/example/www/" gets converted into the
82 DNS hostname "www.example.com.internet" (to remove the top/last
83 label, use the @c remove(-1) method after conversion, or use
84 the @c path_to_hostname method which provides an additional
85 parameter to specify how many elements of the path to skip from
86 both the beginning and the end).
87 *///-------------------------------------------------------------------------
88 HOSTNAME_FROM_PATH = 2,
89
90 /*----------------------------------------------------------------------*//**
91 Don't throw exceptions when parsing a hostname string when the format is
92 invalid (or the data is corrupt).
93
94 When an index parameter is out of range, automatically truncate it to the
95 maximum bounds of any underlying vector instead of throwing an exception when
96 an index is out of range.
97
98 This is useful for evaluating a hostname from a data source that is intended
99 to be reported on in a diagnostic fashion, or as part of a debugging effort.
100 *///-------------------------------------------------------------------------
101 HOSTNAME_WITHOUT_EXCEPTIONS = 4,
102
103 /*----------------------------------------------------------------------*//**
104 If hostname is an FQDN, then include the final period at the end (used by the
105 @ref rhostname method, primarily).
106
107 (The "OPT" portion of this flag name means "optional.")
108 *///-------------------------------------------------------------------------
109 HOSTNAME_FQDN_OPT = 8,
110
111 /*----------------------------------------------------------------------*//**
112 Convert label from/to DNS RR format (this is mostly only useful for authors
113 of DNS client or server software, or raw DNS packet analysis code).
114
115 This format is used in DNS packets, and so this flag could be useful in
116 software projects like DNS resolvers, DNS daemons, and DNS packet analyzers.
117 *///-------------------------------------------------------------------------
118 HOSTNAME_DNS_RR = 16,
119
120 /*----------------------------------------------------------------------*//**
121 Convert label from/to normal UTF-8 data.
122
123 @warning
124 The HOSTNAME_UTF8 and HOSTNAME_XN flags represent two exclusively different
125 formats, and combining them in for a hostname (or for an individual label) is
126 likely to yield unpredictable results (the order in which these two flags are
127 tested will vary depending on which blocks of code are doing the processing,
128 and the programming logic being used; and future updates to the code may also
129 affect changes to the order of flag tests and the programming logic).
130
131 This format is normally presented to users to present hostnames natively in
132 different languages.
133 @see HOSTNAME_XN
134 *///-------------------------------------------------------------------------
135 HOSTNAME_UTF8 = 32,
136
137 /*----------------------------------------------------------------------*//**
138 Convert label from/to punycode format wherein the first four characters will
139 begin with the ASCII sequence "xn--" if any UTF-8 sequences are present.
140
141 @warning
142 The HOSTNAME_XN and HOSTNAME_UTF8 flags represent two exclusively different
143 formats, and combining them in for a hostname (or for an individual label) is
144 likely to yield unpredictable results (the order in which these two flags are
145 tested will vary depending on which blocks of code are doing the processing,
146 and the programming logic being used; and future updates to the code may also
147 affect changes to the order of flag tests and the programming logic).
148
149 This format is used by client software (e.g., eMail applicaitons and web
150 browsers) when attempting to resolve the IP addresses of remote hosts or
151 lookup other DNS records.
152 @see HOSTNAME_UTF8
153 *///-------------------------------------------------------------------------
154 HOSTNAME_XN = 64,
155
156 }; // -x- enum HOSTNAME_FLAGS -x-
157
158 /*======================================================================*//**
159 @brief
160 Structure constructor that detects whether any UTF-8 sequence is present in
161 the data and/or if it's already in punycode format (e.g., it begins with the
162 "xn--" sequence).
163
164 Although the default behaviour is to not convert the data between UTF-8 and
165 punycode, either the @ref HOSTNAME_UTF8 or @ref HOSTNAME_XN flag may be
166 specified to cause such a conversion for underlying storage. The benefit of
167 using these flags here is mostly for performance optimization where repeated
168 outbound conversions can be prevented when accessing the data multiple times.
169 @throws std::invalid_argument If the label is blank or there is a problem
170 with the format (e.g., invalid characters)
171 *///=========================================================================
172 rlabel(
173 /// Label data
174 const std::string& data,
175 /// @ref HOSTNAME_DNS_RR Convert label from DNS RR format@n
176 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
177 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
178 const int flags = HOSTNAME_DEFAULT) {
179
180 // --------------------------------------------------------------------------
181 // Syntax checks.
182 // --------------------------------------------------------------------------
183 if (data.empty()) throw std::invalid_argument("rlabel is empty");
184
185 // --------------------------------------------------------------------------
186 // Convert label from DNS RR format where the first byte specifies the length
187 // of the remaining data that follows (the std::min function is used to
188 // prevent a buffer overrun by automatically truncating the data; this will
189 // already have been vetted by rhostname::hostname, so we don't need to do
190 // this again here at this time).
191 // --------------------------------------------------------------------------
192 this->data = (flags & HOSTNAME_DNS_RR) ? data.substr(1, std::min((size_t)((u_char)data.front()), data.size()))
193 : data;
194
195 // --------------------------------------------------------------------------
196 // Peform conversions, depending on the flags. Our switch statement's scope
197 // is limited only to specific flags, which are logically OR'd together so as
198 // to handle multi-purposed bitfield options in an elegant fashion since each
199 // of these particular flags is exclusive.
200 // --------------------------------------------------------------------------
201 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
202 case HOSTNAME_UTF8: // Convert to UTF8
203 if (this->data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
204 char* p = nullptr;
205 idn2rc = idn2_to_unicode_8z8z(this->data.data(), &p, 0);
206 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
207 if (p != nullptr) idn2_free(p);
208 } // -x- if ^xn-- -x-
209 break;
210 case HOSTNAME_XN: // Convert to punycode
211 if (!this->data.starts_with("xn--")) { // Label is not already in punycode format
212 char* p = nullptr;
213 idn2rc = idn2_to_ascii_8z(this->data.data(), &p, 0);
214 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
215 if (p != nullptr) idn2_free(p);
216 } // -x- if !xn-- -x-
217 break;
218 } // -x- switch flags -x-
219
220 // --------------------------------------------------------------------------
221 // Record whether this label is already in punycode format.
222 //
223 // Note: A 4-byte label of "xn--" is not valid punycode, so we intentionally
224 // check for a "greater than 4" string size (this is not erroneous).
225 // --------------------------------------------------------------------------
226 if (this->data.size() > 4 && this->data.starts_with("xn--")) xn = true;
227
228 // --------------------------------------------------------------------------
229 // Loop through string to determine UTF8 (any character is greater than 127).
230 // --------------------------------------------------------------------------
231 for (int i = this->data.size() - 1; i > 0; i--) {
232 if ((u_char)this->data[i] >= 128) { // UTF8 byte detected
233 utf8 = true;
234 break;
235 } // -x- if utf8 -x-
236 } // -x- for i -x-
237
238//std::cout << "Label: " << this->data << std::endl; // Debug
239//std::cout << " XN: " << xn << std::endl; // Debug
240//std::cout << " UTF8: " << utf8 << std::endl; // Debug
241
242 } // -x- constructor rlabel -x-
243
244// ~rlabel() {
245// std::cout << "Destructor: rlabel = " << data << std::endl; // Debug
246// }
247
248 /*======================================================================*//**
249 @brief
250 Provice label as an std::string with any needed conversions.
251 @returns Label as an std::string object
252 *///=========================================================================
253 std::string get(
254 /// @ref HOSTNAME_DNS_RR Convert label to DNS RR format@n
255 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
256 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
257 const int flags = HOSTNAME_DEFAULT) {
258
259 // --------------------------------------------------------------------------
260 // Internal variables.
261 // --------------------------------------------------------------------------
262 std::string _label;
263
264 // --------------------------------------------------------------------------
265 // Peform conversions, depending on the flags. Our switch statement's scope
266 // is limited only to specific flags, which are logically OR'd together so as
267 // to handle multi-purposed bitfield options in an elegant fashion since each
268 // of these particular flags is exclusive.
269 // --------------------------------------------------------------------------
270 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
271 case HOSTNAME_UTF8: // Convert to UTF8
272 if (data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
273 char* p = nullptr;
274 idn2rc = idn2_to_unicode_8z8z(data.data(), &p, 0);
275 if (idn2rc == IDN2_OK) _label.assign((char*)p);
276 if (p != nullptr) idn2_free(p);
277 } else {
278 _label.assign(data);
279 } // -x- if ^xn-- -x-
280 break;
281 case HOSTNAME_XN: // Convert to punycode
282 if (!data.starts_with("xn--")) { // Label is not already in punycode format
283 char* p = nullptr;
284 idn2rc = idn2_to_ascii_8z(data.data(), &p, 0);
285 if (idn2rc == IDN2_OK) _label.assign((char*)p);
286 if (p != nullptr) idn2_free(p);
287 } else {
288 _label.assign(data);
289 } // -x- if !xn-- -x-
290 break;
291 default: // No conversions necessary, so just assign the label as is
292 _label.assign(data);
293 } // -x- switch flags -x-
294
295 // --------------------------------------------------------------------------
296 // Insert label's length (maximum 255 characters) if it is to be converted to
297 // DNS RR format.
298 // --------------------------------------------------------------------------
299 if (flags & HOSTNAME_DNS_RR)
300 _label.insert(0, // Position: before first character
301 1, // Insert only one character
302 (u_char)(std::min(_label.size(), (size_t)255))); // Convert size to u_char, limit to 255
303
304 // --------------------------------------------------------------------------
305 // Return the label.
306 // --------------------------------------------------------------------------
307 return _label;
308
309 } // -x- std::string get -x-
310
311 /*======================================================================*//**
312 @brief
313 Built-in comparison operator used by @c std::set for ordering rlabel
314 objects by the underlying label data.
315 @returns Underlying rsocket_label
316 *///=========================================================================
317 bool operator<(
318 /// This rlabel structure
319 const rlabel& rhl) const {
320 return data < rhl.data;
321 } // -x- bool < -x-
322
323 }; // -x- struct rlabel -x-
324
325}; // -x- namespace randolf -x-