Última atividade 1 month ago

C++ implementation of Go's net/url package. I wrote it for a project and decided to save it here.

Revisão bb021845b6cc9c6c27c8e19cb501380a7be049d9

url.cpp Bruto
1#include "common/url.h"
2
3#include <fmt/format.h>
4
5#include <cctype>
6
7using namespace ProtoRock::Http;
8
// Selects which URL component is being escaped or unescaped. shouldEscape()
// consults this value to decide which reserved characters may stay literal.
// Numbering starts at 1, so a zero-initialised value matches no mode.
enum EncodingMode {
    encodePath = 1,
    encodePathSegment = 2,
    encodeHost = 3,
    encodeZone = 4,
    encodeUserPassword = 5,
    encodeQueryComponent = 6,
    encodeFragment = 7,
};
18
// Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value in the range 0-15. Any other character yields 0.
char unhex(char c) {
    int value = 0;
    if (c >= '0' && c <= '9') {
        value = c - '0';
    } else if (c >= 'a' && c <= 'f') {
        value = 10 + (c - 'a');
    } else if (c >= 'A' && c <= 'F') {
        value = 10 + (c - 'A');
    }
    return static_cast<char>(value);
}
31
// Upper-case hexadecimal digits, indexed by nibble value (0-15); used when
// emitting %XX escapes.
const char *upperhex{"0123456789ABCDEF"};
33
34// Return true if the specified character should be escaped when
35// appearing in a URL string, according to RFC 3986.
36//
37// Please be informed that for now shouldEscape does not check all
38// reserved characters correctly. See golang.org/issue/5684.
39bool shouldEscape(char c, EncodingMode mode) {
40 // §2.3 Unreserved characters (alphanum)
41 if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
42 return false;
43 }
44 if (mode == encodeHost || mode == encodeZone) {
45 // §3.2.2 Host allows
46 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
47 // as part of reg-name.
48 // We add : because we include :port as part of host.
49 // We add [ ] because we include [ipv6]:port as part of host.
50 // We add < > because they're the only characters left that
51 // we could possibly allow, and Parse will reject them if we
52 // escape them (because hosts can't use %-encoding for
53 // ASCII bytes).
54 switch (c) {
55 case '!':
56 case '$':
57 case '&':
58 case '\'':
59 case '(':
60 case ')':
61 case '*':
62 case '+':
63 case ',':
64 case ';':
65 case '=':
66 case ':':
67 case '[':
68 case ']':
69 case '<':
70 case '>':
71 case '"':
72 return false;
73 }
74 }
75
76 switch (c) {
77 // §2.3 Unreserved characters (mark)
78 case '-':
79 case '_':
80 case '.':
81 case '~':
82 return false;
83
84 // §2.2 Reserved characters (reserved)
85 case '$':
86 case '&':
87 case '+':
88 case ',':
89 case '/':
90 case ':':
91 case ';':
92 case '=':
93 case '?':
94 case '@':
95 // Different sections of the URL allow a few of
96 // the reserved characters to appear unescaped.
97 switch (mode) {
98 case encodePath: // §3.3
99 // The RFC allows : @ & = + $ but saves / ; , for assigning
100 // meaning to individual path segments. This package
101 // only manipulates the path as a whole, so we allow those
102 // last three as well. That leaves only ? to escape.
103 return c == '?';
104
105 case encodePathSegment: // §3.3
106 // The RFC allows : @ & = + $ but saves / ; , for assigning
107 // meaning to individual path segments.
108 return c == '/' || c == ';' || c == ',' || c == '?';
109
110 case encodeUserPassword: // §3.2.1
111 // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
112 // userinfo, so we must escape only '@', '/', and '?'.
113 // The parsing of userinfo treats ':' as special so we must escape
114 // that too.
115 return c == '@' || c == '/' || c == '?' || c == ':';
116
117 case encodeQueryComponent: // §3.4
118 // The RFC reserves (so we must escape) everything.
119 return true;
120
121 case encodeFragment: // §4.1
122 // The RFC text is silent but the grammar allows
123 // everything: case so escape nothing.
124 return false;
125 }
126 }
127
128 if (mode == encodeFragment) {
129 // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
130 // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
131 // need to be escaped. To minimize potential breakage, we apply two restrictions:
132 // (1) we always escape sub-delims outside of the fragment, and (2) we always
133 // escape single quote to avoid breaking callers that had previously assumed that
134 // single quotes would be escaped. See issue #19917.
135 switch (c) {
136 case '!':
137 case '(':
138 case ')':
139 case '*':
140 return false;
141 }
142 }
143
144 return true;
145}
146
147std::string escape(const std::string &s, EncodingMode mode) {
148 auto spaceCount = 0;
149 auto hexCount = 0;
150
151 for (auto i = 0; i < s.size(); i++) {
152 auto c = s[i];
153 if (shouldEscape(c, mode)) {
154 if (c == ' ' && mode == encodeQueryComponent) {
155 spaceCount++;
156 } else {
157 hexCount++;
158 }
159 }
160 }
161
162 if (spaceCount == 0 && hexCount == 0) {
163 return s;
164 }
165
166 auto required = s.size() + 2 * hexCount;
167 auto t = std::vector<char>();
168 t.reserve(required);
169
170 if (hexCount == 0) {
171 t.insert(t.begin(), s.begin(), s.end());
172 for (auto i = 0; i < s.size(); i++) {
173 if (s[i] == ' ') {
174 t[i] = '+';
175 }
176 }
177 return std::string(t.begin(), t.end());
178 }
179
180 auto j = 0;
181 auto c = 0;
182 for (auto i = 0; i < s.size(); i++) {
183 auto c = s[i];
184 if (c == ' ' && mode == encodeQueryComponent) {
185 t[j] = '+';
186 j++;
187 } else if (shouldEscape(c, mode)) {
188 t[j] = '%';
189 t[j + 1] = upperhex[c >> 4];
190 t[j + 2] = upperhex[c & 15];
191 j += 3;
192 } else {
193 t[j] = s[i];
194 j++;
195 }
196 }
197
198 return std::string(t.begin(), t.end());
199}
200
201std::string unescape(std::string s, EncodingMode mode) {
202 // Count %, check that they're well-formed.
203 auto n = 0;
204 auto hasPlus = false;
205 auto tmp = std::string();
206 auto v = 0;
207 for (int i = 0; i < s.size();) {
208 switch (s[i]) {
209 case '%':
210 n++;
211 if (i + 2 >= s.size() || !std::isxdigit(s[i + 1]) || !std::isxdigit(s[i + 2])) {
212 s = std::string(s.begin() + 1, s.end());
213 if (s.size() > 3) {
214 s = std::string(s.begin(), s.begin() + 3);
215 }
216 throw std::invalid_argument("escape error: " + s);
217 }
218 // Per https://tools.ietf.org/html/rfc3986#page-21
219 // in the host component %-encoding can only be used
220 // for non-ASCII bytes.
221 // But https://tools.ietf.org/html/rfc6874#section-2
222 // introduces %25 being allowed to escape a percent sign
223 // in IPv6 scoped-address literals. Yay.
224 tmp = std::string(s.begin() + i, s.begin() + i + 3);
225 if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") {
226 throw std::invalid_argument("escape error: " + tmp);
227 }
228
229 if (mode == encodeZone) {
230 // RFC 6874 says basically "anything goes" for zone identifiers
231 // and that even non-ASCII can be redundantly escaped,
232 // but it seems prudent to restrict %-escaped bytes here to those
233 // that are valid host name bytes in their unescaped form.
234 // That is, you can use escaping in the zone identifier but not
235 // to introduce bytes you couldn't just write directly.
236 // But Windows puts spaces here! Yay.
237 v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]);
238 tmp = std::string(s.begin() + i, s.begin() + i + 3);
239 if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) {
240 throw std::invalid_argument("escape error: " + tmp);
241 }
242 }
243 i += 3;
244 break;
245 case '+':
246 hasPlus = mode == encodeQueryComponent;
247 i++;
248 break;
249 default:
250 if ((mode == encodeHost || mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) {
251 tmp = std::string(s.begin() + i, s.begin() + i + 1);
252 throw std::invalid_argument("invalid host: " + tmp);
253 }
254 i++;
255 }
256 }
257
258 if (n == 0 && !hasPlus) {
259 return s;
260 }
261
262 auto ss = std::stringstream();
263 for (int i = 0; i < s.size(); i++) {
264 switch (s[i]) {
265 case '%':
266 ss << (char)(unhex(s[i + 1]) << 4 | unhex(s[i + 2]));
267 i += 2;
268 break;
269 case '+':
270 ss << ((mode == encodeQueryComponent) ? ' ' : '+');
271 break;
272 default:
273 ss << s[i];
274 }
275 }
276 return ss.str();
277}
278
// Reports whether s contains an ASCII control byte (0x00-0x1F or 0x7F).
//
// Each byte is inspected as unsigned char: with a signed char the old
// comparison `c < ' '` was also true for every byte >= 0x80, so any string
// holding UTF-8 text was reported as containing control bytes.
bool stringContainsCTLByte(const std::string &s) {
    for (const char c : s) {
        const auto byte = static_cast<unsigned char>(c);
        if (byte < 0x20 || byte == 0x7f) {
            return true;
        }
    }
    return false;
}
287
// Splits a leading "scheme:" off url.
//
// On success the scheme (without the ':') is returned and url is updated in
// place to hold whatever followed the ':'. When url does not start with a
// valid scheme (first character is a digit, '+', '-' or '.'; an invalid
// character appears before any ':'; or there is no ':' at all) an empty
// string is returned and url is left untouched.
//
// Throws std::invalid_argument("missing protocol scheme") when url begins
// with ':'. Previously that case silently returned an empty scheme and
// dropped the colon from url.
std::string getScheme(std::string &url) {
    for (std::size_t i = 0; i < url.size(); ++i) {
        const char c = url[i];

        if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
            continue;  // letters are valid anywhere in a scheme
        }

        if (('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.') {
            if (i == 0) {
                return {};  // a scheme must start with a letter
            }
            continue;
        }

        if (c == ':') {
            if (i == 0) {
                throw std::invalid_argument("missing protocol scheme");
            }
            std::string scheme = url.substr(0, i);
            url.erase(0, i + 1);
            return scheme;
        }

        // An invalid character was encountered, so there is no valid scheme.
        return {};
    }
    return {};
}
311
// Reports whether s is an acceptable userinfo string in the sense of
// RFC 3986 Section 3.2.1:
//   userinfo   = *( unreserved / pct-encoded / sub-delims / ":" )
//   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
//   sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
//              / "*" / "+" / "," / ";" / "="
//
// '%' and '@' are accepted as well. Percent-escapes are not validated here;
// the caller does that through unescape().
bool validUserinfo(const std::string &s) {
    // Every non-alphanumeric byte that may appear in userinfo.
    static const std::string allowedPunctuation = "-._:~!$&'()*+,;=%@";

    for (const char ch : s) {
        const bool isLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
        const bool isDigit = (ch >= '0' && ch <= '9');
        if (isLetter || isDigit) {
            continue;
        }
        if (allowedPunctuation.find(ch) == std::string::npos) {
            return false;
        }
    }
    return true;
}
357
// Reports whether port is either empty or a ':' followed by zero or more
// decimal digits, i.e. matches /^:\d*$/.
bool validOptionalPort(const std::string &port) {
    if (port.empty()) {
        return true;
    }
    if (port.front() != ':') {
        return false;
    }
    // Everything after the leading ':' has to be a digit.
    for (std::size_t pos = 1; pos < port.size(); ++pos) {
        const char digit = port[pos];
        if (!(digit >= '0' && digit <= '9')) {
            return false;
        }
    }
    return true;
}
374
375// parseHost parses host as an authority without user
376// information. That is, as host[:port].
377std::string parseHost(const std::string &host) {
378 int idx;
379 if (!host.empty() && host[0] == '[') {
380 // Parse an IP-Literal in RFC 3986 and RFC 6874.
381 // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
382 idx = host.find_last_of(']');
383 if (idx >= host.size()) {
384 throw std::invalid_argument("cannot find ']' in host");
385 }
386 auto colonPort = std::string(host.begin() + idx + 1, host.end());
387 if (!validOptionalPort(colonPort)) {
388 throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
389 }
390 // RFC 6874 defines that %25 (%-encoded percent) introduces
391 // the zone identifier, and the zone identifier can use basically
392 // any %-encoding it likes. That's different from the host, which
393 // can only %-encode non-ASCII bytes.
394 // We do impose some restrictions on the zone, to avoid stupidity
395 // like newlines.
396 auto zone = host.find("%25");
397 if (idx != std::string::npos) {
398 auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost);
399 auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost);
400 auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone);
401 return host1 + host2 + host3;
402 }
403 } else if ((idx = host.find_last_of(':')) < host.size()) {
404 auto colonPort = std::string(host.begin() + idx, host.end());
405 if (!validOptionalPort(colonPort)) {
406 throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
407 }
408 }
409
410 return unescape(host, encodeHost);
411}
412
413void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) {
414 auto i = authority.find_last_of('@');
415 if (i > authority.size()) {
416 host = parseHost(authority);
417 } else {
418 host = parseHost(std::string(authority.begin() + i + 1, authority.end()));
419 }
420
421 if (i > authority.size()) {
422 return;
423 }
424
425 auto userInfo = std::string(authority.begin(), authority.begin() + i);
426 if (!validUserinfo(userInfo)) {
427 throw std::invalid_argument("invalid userinfo");
428 }
429 auto idx = userInfo.find(':');
430 if (idx == std::string::npos) {
431 userInfo = unescape(userInfo, encodeUserPassword);
432 ui = UserInfo(userInfo);
433 } else {
434 auto username = std::string(userInfo.begin(), userInfo.begin() + idx);
435 auto password = std::string(userInfo.begin() + idx, userInfo.end());
436 ui.Username = unescape(username, encodeUserPassword);
437 ui.Password = unescape(username, encodeUserPassword);
438 }
439}
440
441URL URL::Parse(std::string url) {
442 URL u;
443 std::string frag;
444 auto hashIndex = url.find("#");
445 if (hashIndex != std::string::npos) {
446 frag = std::string(url.begin() + hashIndex, url.end());
447 url = std::string(url.begin(), url.begin() + hashIndex);
448 }
449
450 u.setFragment(frag);
451
452 if (stringContainsCTLByte(url)) {
453 throw std::invalid_argument("invalid url: string contains control bytes");
454 }
455
456 if (url == "*") {
457 u.Path = "*";
458 return u;
459 }
460 auto rest = url;
461
462 u.Scheme = getScheme(rest);
463 std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); });
464
465 if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') {
466 u.ForceQuery = true;
467 rest.pop_back();
468 } else {
469 auto idx = rest.find("?");
470 if (idx != std::string::npos) {
471 u.RawQuery = std::string(rest.begin() + idx, rest.end());
472 rest = std::string(rest.begin(), rest.begin() + idx);
473 }
474 }
475
476 if (!rest.empty() && rest[0] != '/') {
477 if (!u.Scheme.empty()) {
478 // We consider rootless paths per RFC 3986 as opaque.
479 u.Opaque = rest;
480 return u;
481 }
482 }
483
484 if (!u.Scheme.empty() || (rest.find("///") != 0 && rest.find("//") == 0)) {
485 auto authority = std::string(rest.begin() + 2, rest.end());
486 rest = "";
487 int i = authority.find("/");
488 if (i != std::string::npos) {
489 rest = std::string(authority.begin() + i, authority.end());
490 authority = std::string(authority.begin(), authority.begin() + i);
491 }
492 parseAuthority(authority, u.User, u.Host);
493 }
494
495 u.setPath(rest);
496
497 return u;
498}
499
500void URL::setFragment(const std::string &f) {
501 Fragment = unescape(f, encodeFragment);
502 auto escf = escape(Fragment, encodeFragment);
503 RawFragment = (escf == f) ? "" : f;
504}
505
506void URL::setPath(const std::string &p) {
507 Path = unescape(p, encodePath);
508 auto escp = escape(Path, encodePath);
509 RawPath = (escp == p) ? "" : p;
510}
511
512std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); }
513std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); }
514std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); }
515std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }
url.h Bruto
1#pragma once
2
3#include <algorithm>
4#include <map>
5#include <stdexcept>
6
7#include "common/common.h"
8
9// Based on Golang Implementation
10// MIT
11
12namespace ProtoRock {
13namespace Http {
// Username and password carried in the userinfo part of a URL.
struct UserInfo {
    std::string Username;
    std::string Password;

    // Both fields start out empty.
    UserInfo() = default;

    // Username only; Password stays empty. Left non-explicit so that code
    // converting implicitly from std::string keeps compiling.
    UserInfo(const std::string &u) : Username(u), Password() {}
};
21
22struct URL {
23 private:
24 void setFragment(const std::string &);
25 void setPath(const std::string &);
26
27 public:
28 std::string Scheme;
29 // encoded opaque data
30 std::string Opaque;
31 // username and password information
32 UserInfo User;
33 // host or host:port
34 std::string Host;
35 // path (relative paths may omit leading slash)
36 std::string Path;
37 // encoded path hint (see EscapedPath method)
38 std::string RawPath;
39 // append a query ('?') even if RawQuery is empty
40 bool ForceQuery = false;
41 // encoded query values, without '?'
42 std::string RawQuery;
43 // fragment for references, without '#'
44 std::string Fragment;
45 // encoded fragment hint (see EscapedFragment method)
46 std::string RawFragment;
47
48 static URL Parse(std::string url);
49 static std::string PathEscape(const std::string &path);
50 static std::string PathUnescape(const std::string &path);
51 static std::string QueryEscape(const std::string &query);
52 static std::string QueryUnescape(const std::string &query);
53};
54
55} // namespace Http
56} // namespace ProtoRock