randolf.ca  1.00
Randolf Richardson's C++ classes
Loading...
Searching...
No Matches
rlabel
1#pragma once
2
3// #include <randolf/rhostname_flags>
4
#include <algorithm>
#include <memory>
#include <stdexcept>
#include <string>

#include <idn2.h> // apt install libidn2-dev
6
7namespace randolf {
8
9 /*======================================================================*//**
10 @brief
11 Internal structure that @ref rhostname uses to store @ref rlabel portions, of
12 which at least one comprises a hostname.
13
14 @note
15 See the `UTF-8 Everywhere` web site for simple, straight-forward, and helpful
16 insights and advice on how to properly work with UTF8 data in C++ (and other
17 languages) at: https://www.utf8everywhere.org/
18 @see rhostname
19 *///=========================================================================
20 struct rlabel {
21 /// Label data
22 std::string data;
23 /// The DNS RR flag indicates that this label is derived from a DNS RR
24 /// wherein the first character indicates the length of the label in bytes)
25 bool dns_rr = false;
26 /// The UTF-8 flag indicates that this label is an internationalized label
27 /// because it contains at least one UTF-8 sequence
28 bool utf8 = false;
29 /// The xn flag indicates that this label seems to be in punycode format,
30 /// because it begins with the sequence "xn--" and is at least 5 bytes long
31 bool xn = false;
32 /// Conversion return code (@c IDN2_OK = successful conversion to UTF8 or
33 /// punycode; or this is the default because no conversion was performed)
34 int idn2rc = IDN2_OK;
35
36 public:
37 /*======================================================================*//**
38 @brief
39 Optional flags that alter, modify, or enhance the operation of hostname
40 handling.
41 *///=========================================================================
42 enum HOSTNAME_FLAGS: int {
43
44 /*----------------------------------------------------------------------*//**
45 The HOSTNAME_DEFAULT flag isn't necessary, but it's included here for
46 completeness as it accomodates programming styles that prefer to emphasize
47 when defaults are being relied upon.
48 *///-------------------------------------------------------------------------
49 HOSTNAME_DEFAULT = 0,
50
51 /*----------------------------------------------------------------------*//**
52 Alternate between an IP address and its IN-ADDR.ARPA formatted counter-part.
53
54 This causes an IP address's octets (IPv4 format) or segments (IPv6 format),
55 when supplied where a hostname would normally be expected, to be effectively
56 reveresed.
57
58 Or, deconvert for output an effectively-reversed IP-address-as-a-hostname in
59 the appropriate IPv4/IPv6 format. (All remaining labels will be removed.)
60
61 @note
62 This format is commonly used to query DNS-based blocklists/blacklists,
63 greylists, whitelists, etc.
64 *///-------------------------------------------------------------------------
65 HOSTNAME_IP_ADDR = 1,
66
67 /*----------------------------------------------------------------------*//**
68 Convert path to hostname. This converts a path, delimited by slashes (as is
69 standard in UNIX, Linux, etc., as well as with internet URIs), to a hostname
70 with the top-level portion corresponding to the top-level label as used in
71 the Domain Name System, the second-level portion with the second-level label,
72 etc.
73
74 For example: The path "/internet/com/example/www/" gets converted into the
75 DNS hostname "www.example.com.internet" (to remove the top/last
76 label, use the @ref remove(-1) method after conversion, or use
77 the @ref path_to_hostname method which provides an additional
78 parameter to specify how many elements of the path to skip from
79 both the beginning and the end).
80 *///-------------------------------------------------------------------------
81 HOSTNAME_FROM_PATH = 2,
82
83 /*----------------------------------------------------------------------*//**
84 Don't throw exceptions when parsing a hostname string when the format is
85 invalid (or the data is corrupt).
86
87 When an index parameter is out of range, automatically truncate it to the
88 maximum bounds of any underlying vector instead of throwing an exception when
89 an index is out of range.
90
91 This is useful for evaluating a hostname from a data source that is intended
92 to be reported on in a diagnostic fashion, or as part of a debugging effort.
93 *///-------------------------------------------------------------------------
94 HOSTNAME_WITHOUT_EXCEPTIONS = 4,
95
96 /*----------------------------------------------------------------------*//**
97 If hostname is an FQDN, then include the final period at the end (used by the
98 @ref rhostname method, primarily).
99
100 (The "OPT" portion of this flag name means "optional.")
101 *///-------------------------------------------------------------------------
102 HOSTNAME_FQDN_OPT = 8,
103
104 /*----------------------------------------------------------------------*//**
105 Convert label from/to DNS RR format (this is mostly only useful for authors
106 of DNS client or server software, or raw DNS packet analysis code).
107
108 This format is used in DNS packets, and so this flag could be useful in
109 software projects like DNS resolvers, DNS daemons, and DNS packet analyzers.
110 *///-------------------------------------------------------------------------
111 HOSTNAME_DNS_RR = 16,
112
113 /*----------------------------------------------------------------------*//**
114 Convert label from/to normal UTF-8 data.
115
116 @warning
117 The HOSTNAME_UTF8 and HOSTNAME_XN flags represent two exclusively different
118 formats, and combining them in for a hostname (or for an individual label) is
119 likely to yield unpredictable results (the order in which these two flags are
120 tested will vary depending on which blocks of code are doing the processing,
121 and the programming logic being used; and future updates to the code may also
122 affect changes to the order of flag tests and the programming logic).
123
124 This format is normally presented to users to present hostnames natively in
125 different languages.
126 @see HOSTNAME_XN
127 *///-------------------------------------------------------------------------
128 HOSTNAME_UTF8 = 32,
129
130 /*----------------------------------------------------------------------*//**
131 Convert label from/to punycode format wherein the first four characters will
132 begin with the ASCII sequence "xn--" if any UTF-8 sequences are present.
133
134 @warning
135 The HOSTNAME_XN and HOSTNAME_UTF8 flags represent two exclusively different
136 formats, and combining them in for a hostname (or for an individual label) is
137 likely to yield unpredictable results (the order in which these two flags are
138 tested will vary depending on which blocks of code are doing the processing,
139 and the programming logic being used; and future updates to the code may also
140 affect changes to the order of flag tests and the programming logic).
141
142 This format is used by client software (e.g., eMail applicaitons and web
143 browsers) when attempting to resolve the IP addresses of remote hosts or
144 lookup other DNS records.
145 @see HOSTNAME_UTF8
146 *///-------------------------------------------------------------------------
147 HOSTNAME_XN = 64,
148
149 }; // -x- enum HOSTNAME_FLAGS -x-
150
151 /*======================================================================*//**
152 @brief
153 Structure constructor that detects whether any UTF-8 sequence is present in
154 the data and/or if it's already in punycode format (e.g., it begins with the
155 "xn--" sequence).
156
157 Although the default behaviour is to not convert the data between UTF-8 and
158 punycode, either the @ref HOSTNAME_UTF8 or @ref HOSTNAME_XN flag may be
159 specified to cause such a conversion for underlying storage. The benefit of
160 using these flags here is mostly for performance optimization where repeated
161 outbound conversions can be prevented when accessing the data multiple times.
162 @throws std::invalid_argument If the label is blank or there is a problem
163 with the format (e.g., invalid characters)
164 *///=========================================================================
165 rlabel(
166 /// Label data
167 const std::string data,
168 /// @ref HOSTNAME_DNS_RR Convert label from DNS RR format@n
169 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
170 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
171 const int flags = HOSTNAME_DEFAULT) {
172
173 // --------------------------------------------------------------------------
174 // Syntax checks.
175 // --------------------------------------------------------------------------
176 if (data.empty()) throw std::invalid_argument("rlabel is empty");
177
178 // --------------------------------------------------------------------------
179 // Convert label from DNS RR format where the first byte specifies the length
180 // of the remaining data that follows (the std::min function is used to
181 // prevent a buffer overrun by automatically truncating the data; this will
182 // already have been vetted by rhostname::hostname, so we don't need to do
183 // this again here at this time).
184 // --------------------------------------------------------------------------
185 this->data = (flags & HOSTNAME_DNS_RR) ? data.substr(1, std::min((size_t)((u_char)data.front()), data.size()))
186 : data;
187
188 // --------------------------------------------------------------------------
189 // Peform conversions, depending on the flags. Our switch statement's scope
190 // is limited only to specific flags, which are logically OR'd together so as
191 // to handle multi-purposed bitfield options in an elegant fashion since each
192 // of these particular flags is exclusive.
193 // --------------------------------------------------------------------------
194 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
195 case HOSTNAME_UTF8: // Convert to UTF8
196 if (this->data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
197 char* p = nullptr;
198 idn2rc = idn2_to_unicode_8z8z(this->data.c_str(), &p, 0);
199 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
200 idn2_free(p);
201 }
202 break;
203 case HOSTNAME_XN: // Convert to punycode
204 if (!this->data.starts_with("xn--")) { // Label is not already in punycode format
205 char* p = nullptr;
206 idn2rc = idn2_to_ascii_8z(this->data.c_str(), &p, 0);
207 if (idn2rc == IDN2_OK) this->data = std::string((char*)p);
208 idn2_free(p);
209 } // -x- if !xn-- -x-
210 break;
211 } // -x- switch flags -x-
212
213 // --------------------------------------------------------------------------
214 // Record whether this label is already in punycode format.
215 //
216 // Note: A 4-byte label of "xn--" is not valid punycode, so we intentionally
217 // check for a "greater than 4" string size (this is not erroneous).
218 // --------------------------------------------------------------------------
219 if (this->data.size() > 4 && this->data.starts_with("xn--")) xn = true;
220
221 // --------------------------------------------------------------------------
222 // Loop through string to determine UTF8 (any character is greater than 127).
223 // --------------------------------------------------------------------------
224 for (int i = this->data.size() - 1; i > 0; i--) {
225 if ((u_char)this->data[i] >= 128) { // UTF8 byte detected
226 utf8 = true;
227 break;
228 } // -x- if utf8 -x-
229 } // -x- for i -x-
230
231//std::cout << "Label: " << this->data << std::endl; // Debug
232//std::cout << " XN: " << xn << std::endl; // Debug
233//std::cout << " UTF8: " << utf8 << std::endl; // Debug
234
235 }; // -x- constructor rlabel -x-
236
237// ~rlabel() {
238// std::cout << "Destructor: rlabel = " << data << std::endl; // Debug
239// }
240
241 /*======================================================================*//**
242 @brief
243 Provice label as an std::string with any needed conversions.
244 @returns Label as an std::string object
245 *///=========================================================================
246 std::string get(
247 /// @ref HOSTNAME_DNS_RR Convert label to DNS RR format@n
248 /// @ref HOSTNAME_UTF8 Convert label to raw UTF-8 format (optional)@n
249 /// @ref HOSTNAME_XN Convert label to punycode format (optional)
250 const int flags = HOSTNAME_DEFAULT) {
251
252 // --------------------------------------------------------------------------
253 // Internal variables.
254 // --------------------------------------------------------------------------
255 std::string _label;
256
257 // --------------------------------------------------------------------------
258 // Peform conversions, depending on the flags. Our switch statement's scope
259 // is limited only to specific flags, which are logically OR'd together so as
260 // to handle multi-purposed bitfield options in an elegant fashion since each
261 // of these particular flags is exclusive.
262 // --------------------------------------------------------------------------
263 switch (flags & (HOSTNAME_UTF8 | HOSTNAME_XN)) {
264 case HOSTNAME_UTF8: // Convert to UTF8
265 if (data.starts_with("xn--")) { // Label is in punycode format, so it doesn't yet qualify as being in UTF-8 format
266 char* p = nullptr;
267 idn2rc = idn2_to_unicode_8z8z(data.c_str(), &p, 0);
268 if (idn2rc == IDN2_OK) _label = std::string((char*)p);
269 idn2_free(p);
270 } else {
271 _label = data;
272 }
273 break;
274 case HOSTNAME_XN: // Convert to punycode
275 if (!data.starts_with("xn--")) { // Label is not already in punycode format
276 char* p = nullptr;
277 idn2rc = idn2_to_ascii_8z(data.c_str(), &p, 0);
278 if (idn2rc == IDN2_OK) _label = std::string((char*)p);
279 idn2_free(p);
280 } else {
281 _label = data;
282 } // -x- if !xn-- -x-
283 break;
284 default: // No conversions necessary, so just assign the label as is
285 _label = data;
286 } // -x- switch flags -x-
287
288 // --------------------------------------------------------------------------
289 // Insert label's length (maximum 255 characters) if it is to be converted to
290 // DNS RR format.
291 // --------------------------------------------------------------------------
292 if (flags & HOSTNAME_DNS_RR)
293 _label.insert(0, // Position: before first character
294 1, // Insert only one character
295 (u_char)(std::min(_label.size(), (size_t)255))); // Convert size to u_char, limit to 255
296
297 // --------------------------------------------------------------------------
298 // Return the label.
299 // --------------------------------------------------------------------------
300 return _label;
301
302 } // -x- std::string get -x-
303
304 /*======================================================================*//**
305 @brief
306 Built-in comparison operator used by @c std::set for ordering rlabel
307 objects by the underlying label data.
308 @returns Underlying rsocket_label
309 *///=========================================================================
310 bool operator<(
311 /// This rlabel structure
312 const rlabel& rhl) const {
313 return data < rhl.data;
314 }; // -x- bool < -x-
315
316 }; // -x- struct rlabel -x-
317
318}; // -x- namespace randolf -x-